Files
PureRL-1.5B-v7-s2-l2-kl-w2-b2/trainer_state.json
ModelHub XC 368a22980b 初始化项目,由ModelHub XC社区提供模型
Model: zhaohq/PureRL-1.5B-v7-s2-l2-kl-w2-b2
Source: Original Platform
2026-06-04 14:49:15 +08:00

12243 lines
503 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21333333333333335,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"adv/mean_abs_final_conf": 0.773959219455719,
"adv/mean_abs_reasoning": 0.47714588046073914,
"adv/mean_abs_step_conf": 0.7498364448547363,
"adv/ratio_final_to_reasoning": 1.622059942565935,
"adv/ratio_step_to_reasoning": 1.5715035496705603,
"adv/std_final_conf": 0.9294352531433105,
"adv/std_reasoning": 0.7393431663513184,
"adv/std_step_conf": 0.9352971315383911,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.38076182006817844,
"calib/avg_num_step_conf": 5.23046875,
"calib/ece": 0.2003187250996017,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.2948207171314741,
"calib/gap": -0.026059730250481805,
"calib/mean_conf": 0.8737051792828686,
"calib/mu_c": 0.865606936416185,
"calib/mu_w": 0.8916666666666668,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19239043824701207,
"calib/std_conf": 0.09027744273295583,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7959393232205367,
"calib/step_q_c_n": 857.0,
"calib/step_q_gap": -0.006446568895645877,
"calib/step_q_w": 0.8023858921161826,
"calib/step_q_w_n": 482.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2492.0,
"completions/max_terminated_length": 2492.0,
"completions/mean_length": 474.94921875,
"completions/mean_terminated_length": 478.68896484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.0010666666666666667,
"grad_norm": 0.04303989186882973,
"kl": 0.000291675329208374,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.0136,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03466901555657387,
"mask/share_reasoning": 0.8340686559677124,
"mask/share_step_conf": 0.12344987690448761,
"num_tokens": 229171.0,
"reward": 1.0788748264312744,
"reward_std": 0.22853493690490723,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7142800688743591,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7420004606246948,
"step": 1
},
{
"adv/mean_abs_final_conf": 0.7672724723815918,
"adv/mean_abs_reasoning": 0.5104547739028931,
"adv/mean_abs_step_conf": 0.770571768283844,
"adv/ratio_final_to_reasoning": 1.503115479781084,
"adv/ratio_step_to_reasoning": 1.509578923891962,
"adv/std_final_conf": 0.9330522418022156,
"adv/std_reasoning": 0.7575037479400635,
"adv/std_step_conf": 0.9354329705238342,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.44343065693430656,
"calib/avg_num_step_conf": 5.05859375,
"calib/ece": 0.3349411764705883,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.2823529411764706,
"calib/gap": 0.002352468143016151,
"calib/mean_conf": 0.8721960784313726,
"calib/mu_c": 0.8732846715328467,
"calib/mu_w": 0.8709322033898306,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3349411764705883,
"calib/std_conf": 0.07627016470309335,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7954391371340525,
"calib/step_q_c_n": 649.0,
"calib/step_q_gap": 0.011011892552009073,
"calib/step_q_w": 0.7844272445820434,
"calib/step_q_w_n": 646.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1966.0,
"completions/max_terminated_length": 1966.0,
"completions/mean_length": 492.9765625,
"completions/mean_terminated_length": 494.9098205566406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.0021333333333333334,
"grad_norm": 0.04042748734354973,
"kl": 0.00037539005279541016,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0158,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03364308178424835,
"mask/share_reasoning": 0.8523939251899719,
"mask/share_step_conf": 0.11005672812461853,
"num_tokens": 458661.0,
"reward": 1.016056776046753,
"reward_std": 0.2184845209121704,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6320762038230896,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.7291916012763977,
"step": 2
},
{
"adv/mean_abs_final_conf": 0.7565299868583679,
"adv/mean_abs_reasoning": 0.4385569989681244,
"adv/mean_abs_step_conf": 0.7518496513366699,
"adv/ratio_final_to_reasoning": 1.725043697030029,
"adv/ratio_step_to_reasoning": 1.7143715710972305,
"adv/std_final_conf": 0.930081844329834,
"adv/std_reasoning": 0.7205978035926819,
"adv/std_step_conf": 0.9347808957099915,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.44436368441918855,
"calib/avg_num_step_conf": 4.98828125,
"calib/ece": 0.2470588235294117,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.28627450980392155,
"calib/gap": -0.00972974758821199,
"calib/mean_conf": 0.8784313725490196,
"calib/mu_c": 0.8748447204968943,
"calib/mu_w": 0.8845744680851063,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2470588235294117,
"calib/std_conf": 0.04320369215429589,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7916621253405993,
"calib/step_q_c_n": 734.0,
"calib/step_q_gap": 0.024829712817578953,
"calib/step_q_w": 0.7668324125230204,
"calib/step_q_w_n": 543.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1604.0,
"completions/max_terminated_length": 1604.0,
"completions/mean_length": 491.1875,
"completions/mean_terminated_length": 493.11376953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.0032,
"grad_norm": 0.03560638055205345,
"kl": 0.00038504600524902344,
"learning_rate": 7.5e-07,
"loss": 0.0598,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.033207207918167114,
"mask/share_reasoning": 0.854034960269928,
"mask/share_step_conf": 0.1088515967130661,
"num_tokens": 689661.0,
"reward": 1.0690152645111084,
"reward_std": 0.21428318321704865,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.6956777572631836,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.7459434270858765,
"step": 3
},
{
"adv/mean_abs_final_conf": 0.7650306224822998,
"adv/mean_abs_reasoning": 0.4247799217700958,
"adv/mean_abs_step_conf": 0.7311046123504639,
"adv/ratio_final_to_reasoning": 1.8010046691810409,
"adv/ratio_step_to_reasoning": 1.7211374052330104,
"adv/std_final_conf": 0.9282382726669312,
"adv/std_reasoning": 0.7012653350830078,
"adv/std_step_conf": 0.9353035092353821,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.48399945989738047,
"calib/avg_num_step_conf": 4.9765625,
"calib/ece": 0.248814229249012,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.2490118577075099,
"calib/gap": -0.004751552795031144,
"calib/mean_conf": 0.8789328063241106,
"calib/mu_c": 0.8772049689440995,
"calib/mu_w": 0.8819565217391306,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.24569169960474324,
"calib/std_conf": 0.04266849221724298,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7867173637515843,
"calib/step_q_c_n": 789.0,
"calib/step_q_gap": 0.00044932251447082905,
"calib/step_q_w": 0.7862680412371135,
"calib/step_q_w_n": 485.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2856.0,
"completions/max_terminated_length": 2856.0,
"completions/mean_length": 512.50390625,
"completions/mean_terminated_length": 514.5137329101562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.004266666666666667,
"grad_norm": 0.05675892159342766,
"kl": 0.00032141804695129395,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0046,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03311063349246979,
"mask/share_reasoning": 0.8511103391647339,
"mask/share_step_conf": 0.11187273263931274,
"num_tokens": 927030.0,
"reward": 1.0533726215362549,
"reward_std": 0.1900576800107956,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.697465181350708,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.7238948345184326,
"step": 4
},
{
"adv/mean_abs_final_conf": 0.7479941844940186,
"adv/mean_abs_reasoning": 0.4255312979221344,
"adv/mean_abs_step_conf": 0.7695996165275574,
"adv/ratio_final_to_reasoning": 1.7577888821491336,
"adv/ratio_step_to_reasoning": 1.808561721042625,
"adv/std_final_conf": 0.9310461282730103,
"adv/std_reasoning": 0.7012953758239746,
"adv/std_step_conf": 0.9351333379745483,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.48754801536491676,
"calib/avg_num_step_conf": 4.8984375,
"calib/ece": 0.32694444444444437,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.32936507936507936,
"calib/gap": -0.004367477592829627,
"calib/mean_conf": 0.8803571428571427,
"calib/mu_c": 0.8784507042253522,
"calib/mu_w": 0.8828181818181818,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.3219047619047618,
"calib/std_conf": 0.04535870513494929,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.7951652892561983,
"calib/step_q_c_n": 726.0,
"calib/step_q_gap": 0.015695592286501414,
"calib/step_q_w": 0.7794696969696969,
"calib/step_q_w_n": 528.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1521.0,
"completions/max_terminated_length": 1521.0,
"completions/mean_length": 507.01171875,
"completions/mean_terminated_length": 509.0000305175781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 195.0,
"epoch": 0.005333333333333333,
"grad_norm": 0.05365122854709625,
"kl": 0.00030153989791870117,
"learning_rate": 1.25e-06,
"loss": 0.0232,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03370092064142227,
"mask/share_reasoning": 0.84963059425354,
"mask/share_step_conf": 0.11276228725910187,
"num_tokens": 1163513.0,
"reward": 0.9934109449386597,
"reward_std": 0.19404548406600952,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.6383934020996094,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.6942647695541382,
"step": 5
},
{
"adv/mean_abs_final_conf": 0.7626480460166931,
"adv/mean_abs_reasoning": 0.3809635639190674,
"adv/mean_abs_step_conf": 0.7435075044631958,
"adv/ratio_final_to_reasoning": 2.0018923546681004,
"adv/ratio_step_to_reasoning": 1.9516499079716398,
"adv/std_final_conf": 0.9299441576004028,
"adv/std_reasoning": 0.6611693501472473,
"adv/std_step_conf": 0.9353631734848022,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.4705513784461153,
"calib/avg_num_step_conf": 5.11328125,
"calib/ece": 0.330984251968504,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.3188976377952756,
"calib/gap": -0.004665413533834539,
"calib/mean_conf": 0.8821653543307086,
"calib/mu_c": 0.8800714285714286,
"calib/mu_w": 0.8847368421052632,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.330984251968504,
"calib/std_conf": 0.04235717339790602,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8080547112462007,
"calib/step_q_c_n": 658.0,
"calib/step_q_gap": -0.0005935222407423835,
"calib/step_q_w": 0.808648233486943,
"calib/step_q_w_n": 651.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2289.0,
"completions/max_terminated_length": 2289.0,
"completions/mean_length": 443.55078125,
"completions/mean_terminated_length": 445.29022216796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.0064,
"grad_norm": 0.03383629024028778,
"kl": 0.0004996657371520996,
"learning_rate": 1.5e-06,
"loss": 0.0014,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03738969564437866,
"mask/share_reasoning": 0.8328449130058289,
"mask/share_step_conf": 0.12585915625095367,
"num_tokens": 1383014.0,
"reward": 0.9994688630104065,
"reward_std": 0.1811923086643219,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6339746117591858,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.7047669291496277,
"step": 6
},
{
"adv/mean_abs_final_conf": 0.7722004652023315,
"adv/mean_abs_reasoning": 0.5118991136550903,
"adv/mean_abs_step_conf": 0.7482302188873291,
"adv/ratio_final_to_reasoning": 1.5085012741839365,
"adv/ratio_step_to_reasoning": 1.4616751600618614,
"adv/std_final_conf": 0.9312074780464172,
"adv/std_reasoning": 0.7576223015785217,
"adv/std_step_conf": 0.9354740381240845,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.41784313725490196,
"calib/avg_num_step_conf": 5.8671875,
"calib/ece": 0.27996047430830046,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.3359683794466403,
"calib/gap": -0.010246405228757971,
"calib/mean_conf": 0.884703557312253,
"calib/mu_c": 0.8806535947712419,
"calib/mu_w": 0.8908999999999999,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.27996047430830046,
"calib/std_conf": 0.04462858434574431,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7931849315068492,
"calib/step_q_c_n": 876.0,
"calib/step_q_gap": 0.00176640115541149,
"calib/step_q_w": 0.7914185303514377,
"calib/step_q_w_n": 626.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2721.0,
"completions/max_terminated_length": 2721.0,
"completions/mean_length": 569.71484375,
"completions/mean_terminated_length": 569.71484375,
"completions/min_length": 186.0,
"completions/min_terminated_length": 186.0,
"epoch": 0.007466666666666667,
"grad_norm": 0.041968848556280136,
"kl": 0.0003533661365509033,
"learning_rate": 1.75e-06,
"loss": 0.0946,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03012840449810028,
"mask/share_reasoning": 0.8556883931159973,
"mask/share_step_conf": 0.11418319493532181,
"num_tokens": 1636285.0,
"reward": 1.0396735668182373,
"reward_std": 0.2362067699432373,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6677848100662231,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.7295831441879272,
"step": 7
},
{
"adv/mean_abs_final_conf": 0.7601855993270874,
"adv/mean_abs_reasoning": 0.4345535933971405,
"adv/mean_abs_step_conf": 0.7925187349319458,
"adv/ratio_final_to_reasoning": 1.7493483217669548,
"adv/ratio_step_to_reasoning": 1.8237537256024008,
"adv/std_final_conf": 0.9302745461463928,
"adv/std_reasoning": 0.7014146447181702,
"adv/std_step_conf": 0.9354472756385803,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.535615989515072,
"calib/avg_num_step_conf": 4.8203125,
"calib/ece": 0.31590361445783144,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.28112449799196787,
"calib/gap": 0.007483617300130918,
"calib/mean_conf": 0.8781526104417672,
"calib/mu_c": 0.8814285714285713,
"calib/mu_w": 0.8739449541284404,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.31590361445783144,
"calib/std_conf": 0.04891391507089884,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.7990615384615386,
"calib/step_q_c_n": 650.0,
"calib/step_q_gap": 0.022982771338250885,
"calib/step_q_w": 0.7760787671232877,
"calib/step_q_w_n": 584.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3029.0,
"completions/max_terminated_length": 3029.0,
"completions/mean_length": 546.8984375,
"completions/mean_terminated_length": 546.8984375,
"completions/min_length": 167.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.008533333333333334,
"grad_norm": 0.04877391830086708,
"kl": 0.00037592649459838867,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0366,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.0320667028427124,
"mask/share_reasoning": 0.8620612025260925,
"mask/share_step_conf": 0.10587209463119507,
"num_tokens": 1882803.0,
"reward": 1.015367865562439,
"reward_std": 0.21866479516029358,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6367086172103882,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.7272679805755615,
"step": 8
},
{
"adv/mean_abs_final_conf": 0.8045555353164673,
"adv/mean_abs_reasoning": 0.4165237545967102,
"adv/mean_abs_step_conf": 0.7727015614509583,
"adv/ratio_final_to_reasoning": 1.9315958008096326,
"adv/ratio_step_to_reasoning": 1.8551200331878053,
"adv/std_final_conf": 0.9291321039199829,
"adv/std_reasoning": 0.681744396686554,
"adv/std_step_conf": 0.9353907108306885,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.4443561734524992,
"calib/avg_num_step_conf": 5.1171875,
"calib/ece": 0.2555905511811023,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.29133858267716534,
"calib/gap": -0.00815094339622624,
"calib/mean_conf": 0.8788976377952756,
"calib/mu_c": 0.8758490566037735,
"calib/mu_w": 0.8839999999999998,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.25425196850393694,
"calib/std_conf": 0.04816684170764448,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7245669291338583,
"calib/step_q_c_n": 889.0,
"calib/step_q_gap": -0.05692950792077345,
"calib/step_q_w": 0.7814964370546318,
"calib/step_q_w_n": 421.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2582.0,
"completions/max_terminated_length": 2582.0,
"completions/mean_length": 537.3984375,
"completions/mean_terminated_length": 539.5059204101562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.0096,
"grad_norm": 0.04915522783994675,
"kl": 0.00032514333724975586,
"learning_rate": 2.25e-06,
"loss": -0.0041,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.033639971166849136,
"mask/share_reasoning": 0.8549748659133911,
"mask/share_step_conf": 0.10747894644737244,
"num_tokens": 2127913.0,
"reward": 1.02982497215271,
"reward_std": 0.21984942257404327,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.686966061592102,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7015809416770935,
"step": 9
},
{
"adv/mean_abs_final_conf": 0.7565653324127197,
"adv/mean_abs_reasoning": 0.4656117558479309,
"adv/mean_abs_step_conf": 0.7444936037063599,
"adv/ratio_final_to_reasoning": 1.6248845157161678,
"adv/ratio_step_to_reasoning": 1.598957917955817,
"adv/std_final_conf": 0.9308657646179199,
"adv/std_reasoning": 0.7393183708190918,
"adv/std_step_conf": 0.9351793527603149,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5120526175213675,
"calib/avg_num_step_conf": 5.203125,
"calib/ece": 0.2644444444444444,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.3412698412698413,
"calib/gap": 0.004294871794871868,
"calib/mean_conf": 0.8834920634920634,
"calib/mu_c": 0.8851282051282052,
"calib/mu_w": 0.8808333333333334,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.2644444444444444,
"calib/std_conf": 0.04569251024585318,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7845816733067729,
"calib/step_q_c_n": 753.0,
"calib/step_q_gap": -0.0060055460369231595,
"calib/step_q_w": 0.790587219343696,
"calib/step_q_w_n": 579.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2607.0,
"completions/max_terminated_length": 2607.0,
"completions/mean_length": 532.19921875,
"completions/mean_terminated_length": 532.19921875,
"completions/min_length": 163.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.010666666666666666,
"grad_norm": 0.043561290949583054,
"kl": 0.0004936754703521729,
"learning_rate": 2.5e-06,
"loss": 0.0907,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03227228671312332,
"mask/share_reasoning": 0.8546050786972046,
"mask/share_step_conf": 0.11312257498502731,
"num_tokens": 2370956.0,
"reward": 1.0441138744354248,
"reward_std": 0.2344008982181549,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.6824515461921692,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7252049446105957,
"step": 10
},
{
"adv/mean_abs_final_conf": 0.7800993919372559,
"adv/mean_abs_reasoning": 0.38626450300216675,
"adv/mean_abs_step_conf": 0.7973983287811279,
"adv/ratio_final_to_reasoning": 2.0195989687742024,
"adv/ratio_step_to_reasoning": 2.0643841786742048,
"adv/std_final_conf": 0.9288783669471741,
"adv/std_reasoning": 0.6612656116485596,
"adv/std_step_conf": 0.9352825284004211,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.4638977212506624,
"calib/avg_num_step_conf": 5.32421875,
"calib/ece": 0.3072800000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.316,
"calib/gap": -0.007354266030736745,
"calib/mean_conf": 0.8849600000000001,
"calib/mu_c": 0.8819594594594594,
"calib/mu_w": 0.8893137254901962,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.30012000000000005,
"calib/std_conf": 0.048524204269621986,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.7817042253521128,
"calib/step_q_c_n": 710.0,
"calib/step_q_gap": 0.025777732243383866,
"calib/step_q_w": 0.755926493108729,
"calib/step_q_w_n": 653.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2646.0,
"completions/max_terminated_length": 2646.0,
"completions/mean_length": 538.734375,
"completions/mean_terminated_length": 542.9763793945312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.011733333333333333,
"grad_norm": 36.05608367919922,
"kl": 4.125522136688232,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.8144,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.032473787665367126,
"mask/share_reasoning": 0.8457951545715332,
"mask/share_step_conf": 0.11391851305961609,
"num_tokens": 2613352.0,
"reward": 1.0236568450927734,
"reward_std": 0.19265316426753998,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.6501550674438477,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.7246680855751038,
"step": 11
},
{
"adv/mean_abs_final_conf": 0.7623910903930664,
"adv/mean_abs_reasoning": 0.4246925711631775,
"adv/mean_abs_step_conf": 0.7670254707336426,
"adv/ratio_final_to_reasoning": 1.7951599395887166,
"adv/ratio_step_to_reasoning": 1.8060722574752366,
"adv/std_final_conf": 0.9276826977729797,
"adv/std_reasoning": 0.7014279961585999,
"adv/std_step_conf": 0.9347568154335022,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5192449219899912,
"calib/avg_num_step_conf": 5.6484375,
"calib/ece": 0.20509960159362556,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.4063745019920319,
"calib/gap": 0.004512805416543975,
"calib/mean_conf": 0.8876494023904383,
"calib/mu_c": 0.8890697674418605,
"calib/mu_w": 0.8845569620253165,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.20374501992031877,
"calib/std_conf": 0.05237442380176764,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7802771362586606,
"calib/step_q_c_n": 866.0,
"calib/step_q_gap": 0.0217426535000399,
"calib/step_q_w": 0.7585344827586207,
"calib/step_q_w_n": 580.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2961.0,
"completions/max_terminated_length": 2961.0,
"completions/mean_length": 481.8671875,
"completions/mean_terminated_length": 485.6614074707031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.0128,
"grad_norm": 0.05654134973883629,
"kl": 0.0009008646011352539,
"learning_rate": 3e-06,
"loss": 0.0313,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03626004233956337,
"mask/share_reasoning": 0.8287768959999084,
"mask/share_step_conf": 0.1271505057811737,
"num_tokens": 2840886.0,
"reward": 1.1029002666473389,
"reward_std": 0.20242473483085632,
"rewards/accuracy_reward_step": 0.671875,
"rewards/final_brier_reward_step": 0.7231832146644592,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.7691406011581421,
"step": 12
},
{
"adv/mean_abs_final_conf": 0.7640227675437927,
"adv/mean_abs_reasoning": 0.4206831753253937,
"adv/mean_abs_step_conf": 0.7589430212974548,
"adv/ratio_final_to_reasoning": 1.8161476673100363,
"adv/ratio_step_to_reasoning": 1.8040726746688194,
"adv/std_final_conf": 0.9303558468818665,
"adv/std_reasoning": 0.7012991309165955,
"adv/std_step_conf": 0.935462236404419,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6333466082570024,
"calib/avg_num_step_conf": 5.01953125,
"calib/ece": 0.25407843137254904,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.396078431372549,
"calib/gap": 0.022385503783353244,
"calib/mean_conf": 0.8861568627450982,
"calib/mu_c": 0.894320987654321,
"calib/mu_w": 0.8719354838709678,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.25247058823529417,
"calib/std_conf": 0.0559283058395228,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7778553299492384,
"calib/step_q_c_n": 788.0,
"calib/step_q_gap": 0.012020319888876307,
"calib/step_q_w": 0.7658350100603621,
"calib/step_q_w_n": 497.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1466.0,
"completions/max_terminated_length": 1466.0,
"completions/mean_length": 478.39453125,
"completions/mean_terminated_length": 480.2706298828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 85.0,
"epoch": 0.013866666666666666,
"grad_norm": 0.055792853236198425,
"kl": 0.0011698007583618164,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.0596,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.034841444343328476,
"mask/share_reasoning": 0.8473033905029297,
"mask/share_step_conf": 0.11394891887903214,
"num_tokens": 3067947.0,
"reward": 1.0867152214050293,
"reward_std": 0.1965121030807495,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.708683967590332,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.7598310112953186,
"step": 13
},
{
"adv/mean_abs_final_conf": 0.7682212591171265,
"adv/mean_abs_reasoning": 0.4448583722114563,
"adv/mean_abs_step_conf": 0.7434722185134888,
"adv/ratio_final_to_reasoning": 1.7268895160906734,
"adv/ratio_step_to_reasoning": 1.6712559883218094,
"adv/std_final_conf": 0.9279747605323792,
"adv/std_reasoning": 0.7206664681434631,
"adv/std_step_conf": 0.9349461197853088,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.36925911304118963,
"calib/avg_num_step_conf": 5.27734375,
"calib/ece": 0.3090438247011953,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.47410358565737054,
"calib/gap": -0.0182945124358469,
"calib/mean_conf": 0.9026693227091633,
"calib/mu_c": 0.895234899328859,
"calib/mu_w": 0.9135294117647059,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3090438247011953,
"calib/std_conf": 0.04100877147377484,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7693725992317542,
"calib/step_q_c_n": 781.0,
"calib/step_q_gap": 0.02968838870543833,
"calib/step_q_w": 0.7396842105263158,
"calib/step_q_w_n": 570.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2683.0,
"completions/max_terminated_length": 2683.0,
"completions/mean_length": 542.46484375,
"completions/mean_terminated_length": 544.5921630859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.014933333333333333,
"grad_norm": 0.0377659797668457,
"kl": 0.0017580986022949219,
"learning_rate": 3.5e-06,
"loss": 0.0361,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03264409676194191,
"mask/share_reasoning": 0.8481143712997437,
"mask/share_step_conf": 0.11533529311418533,
"num_tokens": 3312218.0,
"reward": 1.0354915857315063,
"reward_std": 0.2152160406112671,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.6392582058906555,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7466707229614258,
"step": 14
},
{
"adv/mean_abs_final_conf": 0.7421205043792725,
"adv/mean_abs_reasoning": 0.460532546043396,
"adv/mean_abs_step_conf": 0.7523643970489502,
"adv/ratio_final_to_reasoning": 1.6114398662051181,
"adv/ratio_step_to_reasoning": 1.6336834465072851,
"adv/std_final_conf": 0.9272037148475647,
"adv/std_reasoning": 0.7392576336860657,
"adv/std_step_conf": 0.9352161884307861,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5078076396665423,
"calib/avg_num_step_conf": 4.6875,
"calib/ece": 0.3491372549019608,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5490196078431373,
"calib/gap": 0.0012206047032473633,
"calib/mean_conf": 0.902078431372549,
"calib/mu_c": 0.9026241134751772,
"calib/mu_w": 0.9014035087719299,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3491372549019608,
"calib/std_conf": 0.04888555788868965,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.760141065830721,
"calib/step_q_c_n": 638.0,
"calib/step_q_gap": 0.0033083256171978492,
"calib/step_q_w": 0.7568327402135232,
"calib/step_q_w_n": 562.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1331.0,
"completions/max_terminated_length": 1331.0,
"completions/mean_length": 458.53125,
"completions/mean_terminated_length": 460.3294372558594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.016,
"grad_norm": 0.04049689695239067,
"kl": 0.0028886795043945312,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0327,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.035021279007196426,
"mask/share_reasoning": 0.8494170904159546,
"mask/share_step_conf": 0.11165538430213928,
"num_tokens": 3537482.0,
"reward": 1.0145270824432373,
"reward_std": 0.2300584316253662,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.622326135635376,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.732610285282135,
"step": 15
},
{
"adv/mean_abs_final_conf": 0.7558896541595459,
"adv/mean_abs_reasoning": 0.3702784776687622,
"adv/mean_abs_step_conf": 0.7783836722373962,
"adv/ratio_final_to_reasoning": 2.041408560709644,
"adv/ratio_step_to_reasoning": 2.1021574819525704,
"adv/std_final_conf": 0.9276329874992371,
"adv/std_reasoning": 0.661292552947998,
"adv/std_step_conf": 0.9352163672447205,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5419096209912536,
"calib/avg_num_step_conf": 6.28515625,
"calib/ece": 0.3036507936507936,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6547619047619048,
"calib/gap": 0.009628942486085479,
"calib/mean_conf": 0.9147619047619048,
"calib/mu_c": 0.9185064935064934,
"calib/mu_w": 0.9088775510204079,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3036507936507936,
"calib/std_conf": 0.0450081877156951,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7208686868686869,
"calib/step_q_c_n": 990.0,
"calib/step_q_gap": -0.014010149964915786,
"calib/step_q_w": 0.7348788368336027,
"calib/step_q_w_n": 619.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2466.0,
"completions/max_terminated_length": 2466.0,
"completions/mean_length": 647.96484375,
"completions/mean_terminated_length": 647.96484375,
"completions/min_length": 202.0,
"completions/min_terminated_length": 202.0,
"epoch": 0.017066666666666667,
"grad_norm": 0.0510561466217041,
"kl": 0.004111528396606445,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0373,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.026287715882062912,
"mask/share_reasoning": 0.8636431694030762,
"mask/share_step_conf": 0.110069140791893,
"num_tokens": 3812209.0,
"reward": 1.0580646991729736,
"reward_std": 0.1909414380788803,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6621820330619812,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7578397989273071,
"step": 16
},
{
"adv/mean_abs_final_conf": 0.7609502077102661,
"adv/mean_abs_reasoning": 0.5324435234069824,
"adv/mean_abs_step_conf": 0.7632114887237549,
"adv/ratio_final_to_reasoning": 1.4291660509665371,
"adv/ratio_step_to_reasoning": 1.4334130385137223,
"adv/std_final_conf": 0.9293497204780579,
"adv/std_reasoning": 0.7753995060920715,
"adv/std_step_conf": 0.9355502724647522,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.44006581653640475,
"calib/avg_num_step_conf": 5.37890625,
"calib/ece": 0.1842063492063492,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6785714285714286,
"calib/gap": -0.00877828054298635,
"calib/mean_conf": 0.9157936507936508,
"calib/mu_c": 0.9135294117647059,
"calib/mu_w": 0.9223076923076923,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.17896825396825397,
"calib/std_conf": 0.05148359539317561,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.7065024630541873,
"calib/step_q_c_n": 1015.0,
"calib/step_q_gap": 0.007110197860817125,
"calib/step_q_w": 0.6993922651933702,
"calib/step_q_w_n": 362.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2442.0,
"completions/max_terminated_length": 2442.0,
"completions/mean_length": 498.37109375,
"completions/mean_terminated_length": 502.2952880859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.018133333333333335,
"grad_norm": 0.03440679609775543,
"kl": 0.006146430969238281,
"learning_rate": 4.25e-06,
"loss": 0.0597,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03338541090488434,
"mask/share_reasoning": 0.8393343687057495,
"mask/share_step_conf": 0.11946772038936615,
"num_tokens": 4043320.0,
"reward": 1.146507740020752,
"reward_std": 0.24300463497638702,
"rewards/accuracy_reward_step": 0.73046875,
"rewards/final_brier_reward_step": 0.7447628974914551,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8056056499481201,
"step": 17
},
{
"adv/mean_abs_final_conf": 0.7621678113937378,
"adv/mean_abs_reasoning": 0.4120538830757141,
"adv/mean_abs_step_conf": 0.7573996782302856,
"adv/ratio_final_to_reasoning": 1.8496799634665522,
"adv/ratio_step_to_reasoning": 1.838108338105662,
"adv/std_final_conf": 0.9268068671226501,
"adv/std_reasoning": 0.6816505789756775,
"adv/std_step_conf": 0.9356744885444641,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5002542588354946,
"calib/avg_num_step_conf": 5.08203125,
"calib/ece": 0.37226190476190474,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.7261904761904762,
"calib/gap": 0.010118230358504676,
"calib/mean_conf": 0.9169444444444445,
"calib/mu_c": 0.9215217391304347,
"calib/mu_w": 0.91140350877193,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3707936507936508,
"calib/std_conf": 0.07787727479472598,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.716140350877193,
"calib/step_q_c_n": 627.0,
"calib/step_q_gap": 0.046362902805976325,
"calib/step_q_w": 0.6697774480712166,
"calib/step_q_w_n": 674.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2938.0,
"completions/max_terminated_length": 2938.0,
"completions/mean_length": 512.97265625,
"completions/mean_terminated_length": 514.9843139648438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.0192,
"grad_norm": 0.03835965692996979,
"kl": 0.007046699523925781,
"learning_rate": 4.5e-06,
"loss": 0.0275,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.033493079245090485,
"mask/share_reasoning": 0.8562003374099731,
"mask/share_step_conf": 0.10640032589435577,
"num_tokens": 4285361.0,
"reward": 1.0004057884216309,
"reward_std": 0.21100187301635742,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.5974195599555969,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7335113286972046,
"step": 18
},
{
"adv/mean_abs_final_conf": 0.7491005659103394,
"adv/mean_abs_reasoning": 0.3827943205833435,
"adv/mean_abs_step_conf": 0.7685535550117493,
"adv/ratio_final_to_reasoning": 1.956927064039975,
"adv/ratio_step_to_reasoning": 2.0077454488889597,
"adv/std_final_conf": 0.9226001501083374,
"adv/std_reasoning": 0.6612175703048706,
"adv/std_step_conf": 0.9356136322021484,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5578053142565151,
"calib/avg_num_step_conf": 4.5625,
"calib/ece": 0.31964705882352945,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.7686274509803922,
"calib/gap": 0.03273377618804252,
"calib/mean_conf": 0.9157254901960784,
"calib/mu_c": 0.9289473684210525,
"calib/mu_w": 0.89621359223301,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.31964705882352945,
"calib/std_conf": 0.08228217314167494,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6997557471264367,
"calib/step_q_c_n": 696.0,
"calib/step_q_gap": 0.04212862848236898,
"calib/step_q_w": 0.6576271186440678,
"calib/step_q_w_n": 472.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1678.0,
"completions/max_terminated_length": 1678.0,
"completions/mean_length": 469.78125,
"completions/mean_terminated_length": 469.78125,
"completions/min_length": 222.0,
"completions/min_terminated_length": 222.0,
"epoch": 0.020266666666666665,
"grad_norm": 0.035939835011959076,
"kl": 0.010945320129394531,
"learning_rate": 4.75e-06,
"loss": 0.0018,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03303908556699753,
"mask/share_reasoning": 0.8579931259155273,
"mask/share_step_conf": 0.10896774381399155,
"num_tokens": 4510385.0,
"reward": 1.0813804864883423,
"reward_std": 0.19825337827205658,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6634472608566284,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.7875632047653198,
"step": 19
},
{
"adv/mean_abs_final_conf": 0.7338466048240662,
"adv/mean_abs_reasoning": 0.39686164259910583,
"adv/mean_abs_step_conf": 0.7599142789840698,
"adv/ratio_final_to_reasoning": 1.8491245463229848,
"adv/ratio_step_to_reasoning": 1.9148090856230862,
"adv/std_final_conf": 0.9257152080535889,
"adv/std_reasoning": 0.7012358903884888,
"adv/std_step_conf": 0.9355400800704956,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.4792622324159021,
"calib/avg_num_step_conf": 4.96484375,
"calib/ece": 0.3584189723320159,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.7984189723320159,
"calib/gap": 0.007530581039755679,
"calib/mean_conf": 0.9275889328063242,
"calib/mu_c": 0.9308333333333335,
"calib/mu_w": 0.9233027522935778,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3584189723320159,
"calib/std_conf": 0.06596888861326226,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6546230440967283,
"calib/step_q_c_n": 703.0,
"calib/step_q_gap": 0.03510367789954527,
"calib/step_q_w": 0.6195193661971831,
"calib/step_q_w_n": 568.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2576.0,
"completions/max_terminated_length": 2576.0,
"completions/mean_length": 483.69140625,
"completions/mean_terminated_length": 483.69140625,
"completions/min_length": 161.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.021333333333333333,
"grad_norm": 0.027182994410395622,
"kl": 0.015130996704101562,
"learning_rate": 5e-06,
"loss": 0.0232,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03616435080766678,
"mask/share_reasoning": 0.8422824144363403,
"mask/share_step_conf": 0.1215532124042511,
"num_tokens": 4739082.0,
"reward": 1.054613709449768,
"reward_std": 0.19555900990962982,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.6180988550186157,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7878357172012329,
"step": 20
},
{
"adv/mean_abs_final_conf": 0.7652933597564697,
"adv/mean_abs_reasoning": 0.48477011919021606,
"adv/mean_abs_step_conf": 0.7567232847213745,
"adv/ratio_final_to_reasoning": 1.5786727140584769,
"adv/ratio_step_to_reasoning": 1.5609940769151416,
"adv/std_final_conf": 0.9191557765007019,
"adv/std_reasoning": 0.7392932772636414,
"adv/std_step_conf": 0.9357109069824219,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5639366001515534,
"calib/avg_num_step_conf": 5.0859375,
"calib/ece": 0.358627450980392,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.8705882352941177,
"calib/gap": 0.00974362212679969,
"calib/mean_conf": 0.9390196078431372,
"calib/mu_c": 0.9431081081081082,
"calib/mu_w": 0.9333644859813085,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.358627450980392,
"calib/std_conf": 0.03530566259980002,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6201876675603216,
"calib/step_q_c_n": 746.0,
"calib/step_q_gap": 0.015709250294134613,
"calib/step_q_w": 0.604478417266187,
"calib/step_q_w_n": 556.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2271.0,
"completions/max_terminated_length": 2271.0,
"completions/mean_length": 501.375,
"completions/mean_terminated_length": 501.375,
"completions/min_length": 138.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.0224,
"grad_norm": 0.025243666023015976,
"kl": 0.017017364501953125,
"learning_rate": 4.9722222222222224e-06,
"loss": 0.0133,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03457701951265335,
"mask/share_reasoning": 0.8496730327606201,
"mask/share_step_conf": 0.11574994772672653,
"num_tokens": 4970394.0,
"reward": 1.075460433959961,
"reward_std": 0.21974530816078186,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.6288824081420898,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8047963976860046,
"step": 21
},
{
"adv/mean_abs_final_conf": 0.7464021444320679,
"adv/mean_abs_reasoning": 0.4103482961654663,
"adv/mean_abs_step_conf": 0.7724449038505554,
"adv/ratio_final_to_reasoning": 1.818947833844772,
"adv/ratio_step_to_reasoning": 1.8824128455478697,
"adv/std_final_conf": 0.9118995070457458,
"adv/std_reasoning": 0.6815477609634399,
"adv/std_step_conf": 0.9356730580329895,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.4308304498269896,
"calib/avg_num_step_conf": 5.41015625,
"calib/ece": 0.27878431372549023,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.9019607843137255,
"calib/gap": -0.006294117647058783,
"calib/mean_conf": 0.9454509803921568,
"calib/mu_c": 0.9433529411764706,
"calib/mu_w": 0.9496470588235294,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.27878431372549023,
"calib/std_conf": 0.03242627458331524,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5930643402399127,
"calib/step_q_c_n": 917.0,
"calib/step_q_gap": -0.0035809589053864554,
"calib/step_q_w": 0.5966452991452992,
"calib/step_q_w_n": 468.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1832.0,
"completions/max_terminated_length": 1832.0,
"completions/mean_length": 460.67578125,
"completions/mean_terminated_length": 460.67578125,
"completions/min_length": 155.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.023466666666666667,
"grad_norm": 0.027960635721683502,
"kl": 0.024440765380859375,
"learning_rate": 4.944444444444445e-06,
"loss": 0.0032,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03475780785083771,
"mask/share_reasoning": 0.839755117893219,
"mask/share_step_conf": 0.1254870593547821,
"num_tokens": 5190143.0,
"reward": 1.1180078983306885,
"reward_std": 0.20199835300445557,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.6934887170791626,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8069971799850464,
"step": 22
},
{
"adv/mean_abs_final_conf": 0.7499644160270691,
"adv/mean_abs_reasoning": 0.45654457807540894,
"adv/mean_abs_step_conf": 0.7715545296669006,
"adv/ratio_final_to_reasoning": 1.6426970158940208,
"adv/ratio_step_to_reasoning": 1.6899872799266067,
"adv/std_final_conf": 0.9265665411949158,
"adv/std_reasoning": 0.7392275333404541,
"adv/std_step_conf": 0.9358251690864563,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5026871641044869,
"calib/avg_num_step_conf": 5.07421875,
"calib/ece": 0.4441897233201581,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.8932806324110671,
"calib/gap": 0.0004255718035246492,
"calib/mean_conf": 0.9461660079051383,
"calib/mu_c": 0.9463779527559054,
"calib/mu_w": 0.9459523809523808,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.4441897233201581,
"calib/std_conf": 0.034018190185690626,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6132223454833597,
"calib/step_q_c_n": 631.0,
"calib/step_q_gap": 0.02573731554323999,
"calib/step_q_w": 0.5874850299401198,
"calib/step_q_w_n": 668.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2450.0,
"completions/max_terminated_length": 2450.0,
"completions/mean_length": 496.140625,
"completions/mean_terminated_length": 496.140625,
"completions/min_length": 165.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.024533333333333334,
"grad_norm": 0.04829743877053261,
"kl": 0.04451179504394531,
"learning_rate": 4.9166666666666665e-06,
"loss": 0.0351,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03656258434057236,
"mask/share_reasoning": 0.8382663726806641,
"mask/share_step_conf": 0.12517108023166656,
"num_tokens": 5421091.0,
"reward": 0.9915996789932251,
"reward_std": 0.22186483442783356,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.5374960899353027,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7669271230697632,
"step": 23
},
{
"adv/mean_abs_final_conf": 0.7683982849121094,
"adv/mean_abs_reasoning": 0.5511972904205322,
"adv/mean_abs_step_conf": 0.768555760383606,
"adv/ratio_final_to_reasoning": 1.3940530881889226,
"adv/ratio_step_to_reasoning": 1.394338785296353,
"adv/std_final_conf": 0.9279333353042603,
"adv/std_reasoning": 0.7928519248962402,
"adv/std_step_conf": 0.9359160661697388,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5019592476489028,
"calib/avg_num_step_conf": 5.828125,
"calib/ece": 0.4192741935483872,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.9274193548387096,
"calib/gap": 0.0017450365726229267,
"calib/mean_conf": 0.9515322580645161,
"calib/mu_c": 0.9523484848484849,
"calib/mu_w": 0.950603448275862,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.4192741935483872,
"calib/std_conf": 0.028766141467029323,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5671428571428572,
"calib/step_q_c_n": 826.0,
"calib/step_q_gap": -0.004703989703989664,
"calib/step_q_w": 0.5718468468468468,
"calib/step_q_w_n": 666.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2861.0,
"completions/max_terminated_length": 2861.0,
"completions/mean_length": 577.125,
"completions/mean_terminated_length": 577.125,
"completions/min_length": 159.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.0256,
"grad_norm": 0.02429387718439102,
"kl": 0.023302078247070312,
"learning_rate": 4.888888888888889e-06,
"loss": 0.0614,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03134583681821823,
"mask/share_reasoning": 0.8475775718688965,
"mask/share_step_conf": 0.12107663601636887,
"num_tokens": 5673347.0,
"reward": 1.0050941705703735,
"reward_std": 0.25456106662750244,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.5573132634162903,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.7706666588783264,
"step": 24
},
{
"adv/mean_abs_final_conf": 0.7260433435440063,
"adv/mean_abs_reasoning": 0.38976427912712097,
"adv/mean_abs_step_conf": 0.762317419052124,
"adv/ratio_final_to_reasoning": 1.8627754836076411,
"adv/ratio_step_to_reasoning": 1.9558421843051847,
"adv/std_final_conf": 0.9239747524261475,
"adv/std_reasoning": 0.681587278842926,
"adv/std_step_conf": 0.9359145760536194,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5706342939860507,
"calib/avg_num_step_conf": 5.51953125,
"calib/ece": 0.3552191235059762,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9203187250996016,
"calib/gap": 0.0056507435188840605,
"calib/mean_conf": 0.9488446215139443,
"calib/mu_c": 0.9511409395973154,
"calib/mu_w": 0.9454901960784313,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.3552191235059762,
"calib/std_conf": 0.03377715465001707,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5745710627400769,
"calib/step_q_c_n": 781.0,
"calib/step_q_gap": -0.0027074182725814744,
"calib/step_q_w": 0.5772784810126583,
"calib/step_q_w_n": 632.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2525.0,
"completions/max_terminated_length": 2525.0,
"completions/mean_length": 506.0859375,
"completions/mean_terminated_length": 508.07061767578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.02666666666666667,
"grad_norm": 0.02284402586519718,
"kl": 0.029443740844726562,
"learning_rate": 4.861111111111111e-06,
"loss": 0.0858,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03279361128807068,
"mask/share_reasoning": 0.841393768787384,
"mask/share_step_conf": 0.1219063550233841,
"num_tokens": 5906129.0,
"reward": 1.0461409091949463,
"reward_std": 0.20731407403945923,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.6217843294143677,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7719982862472534,
"step": 25
},
{
"adv/mean_abs_final_conf": 0.7309408187866211,
"adv/mean_abs_reasoning": 0.47793784737586975,
"adv/mean_abs_step_conf": 0.76811283826828,
"adv/ratio_final_to_reasoning": 1.5293637505375872,
"adv/ratio_step_to_reasoning": 1.6071395945845754,
"adv/std_final_conf": 0.9263646602630615,
"adv/std_reasoning": 0.757454514503479,
"adv/std_step_conf": 0.9359714984893799,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5484908303616913,
"calib/avg_num_step_conf": 5.125,
"calib/ece": 0.35827450980392156,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.9450980392156862,
"calib/gap": 0.005112710137544441,
"calib/mean_conf": 0.9504313725490194,
"calib/mu_c": 0.9525165562913906,
"calib/mu_w": 0.9474038461538462,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.35827450980392156,
"calib/std_conf": 0.02953579078561796,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.588168044077135,
"calib/step_q_c_n": 726.0,
"calib/step_q_gap": 0.03574483588600863,
"calib/step_q_w": 0.5524232081911263,
"calib/step_q_w_n": 586.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2184.0,
"completions/max_terminated_length": 2184.0,
"completions/mean_length": 489.28515625,
"completions/mean_terminated_length": 489.28515625,
"completions/min_length": 228.0,
"completions/min_terminated_length": 228.0,
"epoch": 0.027733333333333332,
"grad_norm": 0.02680124156177044,
"kl": 0.03493499755859375,
"learning_rate": 4.833333333333333e-06,
"loss": 0.0069,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03132498264312744,
"mask/share_reasoning": 0.8564911484718323,
"mask/share_step_conf": 0.11218388378620148,
"num_tokens": 6136626.0,
"reward": 1.0738930702209473,
"reward_std": 0.22200658917427063,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6292617321014404,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8008912801742554,
"step": 26
},
{
"adv/mean_abs_final_conf": 0.7657771110534668,
"adv/mean_abs_reasoning": 0.5099864602088928,
"adv/mean_abs_step_conf": 0.7635960578918457,
"adv/ratio_final_to_reasoning": 1.5015636115903959,
"adv/ratio_step_to_reasoning": 1.4972869232235562,
"adv/std_final_conf": 0.9304526448249817,
"adv/std_reasoning": 0.7575855255126953,
"adv/std_step_conf": 0.935930609703064,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.4673167996011965,
"calib/avg_num_step_conf": 5.71484375,
"calib/ece": 0.4856692913385828,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.8858267716535433,
"calib/gap": -0.012193419740777589,
"calib/mean_conf": 0.9255118110236221,
"calib/mu_c": 0.9189830508474577,
"calib/mu_w": 0.9311764705882353,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4733070866141733,
"calib/std_conf": 0.13233344106566278,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.545643879173291,
"calib/step_q_c_n": 629.0,
"calib/step_q_gap": -0.018696648404646576,
"calib/step_q_w": 0.5643405275779376,
"calib/step_q_w_n": 834.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2514.0,
"completions/max_terminated_length": 2514.0,
"completions/mean_length": 501.94140625,
"completions/mean_terminated_length": 501.94140625,
"completions/min_length": 200.0,
"completions/min_terminated_length": 200.0,
"epoch": 0.0288,
"grad_norm": 0.031133631244301796,
"kl": 0.03224945068359375,
"learning_rate": 4.805555555555556e-06,
"loss": 0.0273,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.032666970044374466,
"mask/share_reasoning": 0.844009518623352,
"mask/share_step_conf": 0.12332353740930557,
"num_tokens": 6370339.0,
"reward": 0.9932471513748169,
"reward_std": 0.23951643705368042,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.5111820101737976,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.7897913455963135,
"step": 27
},
{
"adv/mean_abs_final_conf": 0.7506414651870728,
"adv/mean_abs_reasoning": 0.4154139459133148,
"adv/mean_abs_step_conf": 0.747469425201416,
"adv/ratio_final_to_reasoning": 1.8069722323277766,
"adv/ratio_step_to_reasoning": 1.7993363789413845,
"adv/std_final_conf": 0.8933430314064026,
"adv/std_reasoning": 0.6817694306373596,
"adv/std_step_conf": 0.9356337189674377,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.539148305370258,
"calib/avg_num_step_conf": 5.14453125,
"calib/ece": 0.3367600000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.924,
"calib/gap": 0.01903038878781782,
"calib/mean_conf": 0.9487599999999999,
"calib/mu_c": 0.9561437908496734,
"calib/mu_w": 0.9371134020618556,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.3367600000000001,
"calib/std_conf": 0.05744965099981025,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5563813229571984,
"calib/step_q_c_n": 771.0,
"calib/step_q_gap": 0.04348755006342553,
"calib/step_q_w": 0.5128937728937729,
"calib/step_q_w_n": 546.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2453.0,
"completions/max_terminated_length": 2453.0,
"completions/mean_length": 575.5234375,
"completions/mean_terminated_length": 575.5234375,
"completions/min_length": 184.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.029866666666666666,
"grad_norm": 0.03289984166622162,
"kl": 0.0348358154296875,
"learning_rate": 4.777777777777778e-06,
"loss": 0.014,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.030212881043553352,
"mask/share_reasoning": 0.8617681264877319,
"mask/share_step_conf": 0.10801897943019867,
"num_tokens": 6624617.0,
"reward": 1.0612573623657227,
"reward_std": 0.22830414772033691,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6390644311904907,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.7801127433776855,
"step": 28
},
{
"adv/mean_abs_final_conf": 0.760231614112854,
"adv/mean_abs_reasoning": 0.5034997463226318,
"adv/mean_abs_step_conf": 0.7915377616882324,
"adv/ratio_final_to_reasoning": 1.5098947311598323,
"adv/ratio_step_to_reasoning": 1.5720718182468199,
"adv/std_final_conf": 0.9243661165237427,
"adv/std_reasoning": 0.757530689239502,
"adv/std_step_conf": 0.9357538819313049,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5606481481481483,
"calib/avg_num_step_conf": 5.84375,
"calib/ece": 0.4767843137254901,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9098039215686274,
"calib/gap": 0.009055555555555816,
"calib/mean_conf": 0.9473725490196078,
"calib/mu_c": 0.9521666666666667,
"calib/mu_w": 0.9431111111111109,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.4767843137254901,
"calib/std_conf": 0.0380061712142284,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5177503852080123,
"calib/step_q_c_n": 649.0,
"calib/step_q_gap": 0.008317091229263829,
"calib/step_q_w": 0.5094332939787485,
"calib/step_q_w_n": 847.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2448.0,
"completions/max_terminated_length": 2448.0,
"completions/mean_length": 579.4765625,
"completions/mean_terminated_length": 579.4765625,
"completions/min_length": 182.0,
"completions/min_terminated_length": 182.0,
"epoch": 0.030933333333333334,
"grad_norm": 0.025897309184074402,
"kl": 0.031734466552734375,
"learning_rate": 4.75e-06,
"loss": -0.0289,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.029150307178497314,
"mask/share_reasoning": 0.8561632037162781,
"mask/share_step_conf": 0.11468647420406342,
"num_tokens": 6880091.0,
"reward": 1.0111935138702393,
"reward_std": 0.22310924530029297,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.5168156623840332,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.809443473815918,
"step": 29
},
{
"adv/mean_abs_final_conf": 0.7617216110229492,
"adv/mean_abs_reasoning": 0.5963394641876221,
"adv/mean_abs_step_conf": 0.7448407411575317,
"adv/ratio_final_to_reasoning": 1.2773288651299022,
"adv/ratio_step_to_reasoning": 1.2490213810890567,
"adv/std_final_conf": 0.9236946105957031,
"adv/std_reasoning": 0.8265910148620605,
"adv/std_step_conf": 0.9360052943229675,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5465801886792453,
"calib/avg_num_step_conf": 5.54296875,
"calib/ece": 0.3615199999999998,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.88,
"calib/gap": 0.02419287211740051,
"calib/mean_conf": 0.93752,
"calib/mu_c": 0.9477777777777779,
"calib/mu_w": 0.9235849056603774,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3615199999999998,
"calib/std_conf": 0.08375828078464839,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5081538461538462,
"calib/step_q_c_n": 780.0,
"calib/step_q_gap": 0.03649030937763337,
"calib/step_q_w": 0.4716635367762128,
"calib/step_q_w_n": 639.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2796.0,
"completions/max_terminated_length": 2796.0,
"completions/mean_length": 586.51953125,
"completions/mean_terminated_length": 586.51953125,
"completions/min_length": 211.0,
"completions/min_terminated_length": 211.0,
"epoch": 0.032,
"grad_norm": 0.02579519897699356,
"kl": 0.032093048095703125,
"learning_rate": 4.722222222222222e-06,
"loss": 0.0179,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.028735563158988953,
"mask/share_reasoning": 0.8600101470947266,
"mask/share_step_conf": 0.1112542599439621,
"num_tokens": 7137224.0,
"reward": 1.061484932899475,
"reward_std": 0.2636525630950928,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.6148117184638977,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.800751268863678,
"step": 30
},
{
"adv/mean_abs_final_conf": 0.732903003692627,
"adv/mean_abs_reasoning": 0.3279497027397156,
"adv/mean_abs_step_conf": 0.758866548538208,
"adv/ratio_final_to_reasoning": 2.23480307367228,
"adv/ratio_step_to_reasoning": 2.313972362830586,
"adv/std_final_conf": 0.9095056056976318,
"adv/std_reasoning": 0.6187405586242676,
"adv/std_step_conf": 0.9357991814613342,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5910506094329624,
"calib/avg_num_step_conf": 6.1484375,
"calib/ece": 0.4873684210526316,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.8663967611336032,
"calib/gap": 0.03189785373608889,
"calib/mean_conf": 0.9367611336032389,
"calib/mu_c": 0.9543243243243243,
"calib/mu_w": 0.9224264705882355,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.4873684210526316,
"calib/std_conf": 0.08962379552452024,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5029333333333333,
"calib/step_q_c_n": 600.0,
"calib/step_q_gap": 0.07074647501711151,
"calib/step_q_w": 0.43218685831622183,
"calib/step_q_w_n": 974.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2540.0,
"completions/max_terminated_length": 2540.0,
"completions/mean_length": 593.453125,
"completions/mean_terminated_length": 598.1259765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 195.0,
"epoch": 0.03306666666666667,
"grad_norm": 0.03246685862541199,
"kl": 0.03501129150390625,
"learning_rate": 4.694444444444445e-06,
"loss": -0.0714,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.02911219745874405,
"mask/share_reasoning": 0.8506971597671509,
"mask/share_step_conf": 0.11237817257642746,
"num_tokens": 7395060.0,
"reward": 0.979390025138855,
"reward_std": 0.20983630418777466,
"rewards/accuracy_reward_step": 0.43359375,
"rewards/final_brier_reward_step": 0.5042523145675659,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.783747673034668,
"step": 31
},
{
"adv/mean_abs_final_conf": 0.7484625577926636,
"adv/mean_abs_reasoning": 0.41868317127227783,
"adv/mean_abs_step_conf": 0.7483275532722473,
"adv/ratio_final_to_reasoning": 1.787658566543922,
"adv/ratio_step_to_reasoning": 1.787336116229031,
"adv/std_final_conf": 0.9268016815185547,
"adv/std_reasoning": 0.6816485524177551,
"adv/std_step_conf": 0.9360527992248535,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6109027336300064,
"calib/avg_num_step_conf": 5.62890625,
"calib/ece": 0.39816733067729093,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.7569721115537849,
"calib/gap": 0.04753083280355985,
"calib/mean_conf": 0.911394422310757,
"calib/mu_c": 0.9343076923076922,
"calib/mu_w": 0.8867768595041323,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3958167330677292,
"calib/std_conf": 0.1273857890494772,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4802011494252874,
"calib/step_q_c_n": 696.0,
"calib/step_q_gap": 0.05791927023065652,
"calib/step_q_w": 0.4222818791946309,
"calib/step_q_w_n": 745.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2402.0,
"completions/max_terminated_length": 2402.0,
"completions/mean_length": 534.71484375,
"completions/mean_terminated_length": 541.0553588867188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.034133333333333335,
"grad_norm": 0.028719009831547737,
"kl": 0.038970947265625,
"learning_rate": 4.666666666666667e-06,
"loss": -0.0573,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.030235659331083298,
"mask/share_reasoning": 0.8475057482719421,
"mask/share_step_conf": 0.11053981631994247,
"num_tokens": 7638651.0,
"reward": 1.043382167816162,
"reward_std": 0.2221207618713379,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.5912359356880188,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7985814809799194,
"step": 32
},
{
"adv/mean_abs_final_conf": 0.7398884892463684,
"adv/mean_abs_reasoning": 0.4711639881134033,
"adv/mean_abs_step_conf": 0.7429441809654236,
"adv/ratio_final_to_reasoning": 1.5703417661629233,
"adv/ratio_step_to_reasoning": 1.576827176330391,
"adv/std_final_conf": 0.9138897061347961,
"adv/std_reasoning": 0.7392632365226746,
"adv/std_step_conf": 0.9357470273971558,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5652297220646625,
"calib/avg_num_step_conf": 5.671875,
"calib/ece": 0.4093253968253968,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.8293650793650794,
"calib/gap": 0.039725846095670425,
"calib/mean_conf": 0.9212301587301587,
"calib/mu_c": 0.9406201550387597,
"calib/mu_w": 0.9008943089430893,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4093253968253968,
"calib/std_conf": 0.10921288826629788,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5037347767253044,
"calib/step_q_c_n": 739.0,
"calib/step_q_gap": 0.017423416276496573,
"calib/step_q_w": 0.48631136044880785,
"calib/step_q_w_n": 713.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2534.0,
"completions/max_terminated_length": 2534.0,
"completions/mean_length": 552.71484375,
"completions/mean_terminated_length": 552.71484375,
"completions/min_length": 180.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.0352,
"grad_norm": 0.026177920401096344,
"kl": 0.03797149658203125,
"learning_rate": 4.638888888888889e-06,
"loss": 0.0651,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.030195150524377823,
"mask/share_reasoning": 0.8575115203857422,
"mask/share_step_conf": 0.11229334771633148,
"num_tokens": 7887018.0,
"reward": 1.0466405153274536,
"reward_std": 0.21856100857257843,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.5812917947769165,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.809555172920227,
"step": 33
},
{
"adv/mean_abs_final_conf": 0.7491120100021362,
"adv/mean_abs_reasoning": 0.5132975578308105,
"adv/mean_abs_step_conf": 0.7715127468109131,
"adv/ratio_final_to_reasoning": 1.459410820436931,
"adv/ratio_step_to_reasoning": 1.5030516608559699,
"adv/std_final_conf": 0.928077220916748,
"adv/std_reasoning": 0.7575961947441101,
"adv/std_step_conf": 0.9354490637779236,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.49954850361197123,
"calib/avg_num_step_conf": 5.90625,
"calib/ece": 0.32480314960629925,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.7047244094488189,
"calib/gap": 0.0014060887512901132,
"calib/mean_conf": 0.8845669291338583,
"calib/mu_c": 0.8851315789473684,
"calib/mu_w": 0.8837254901960783,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3054724409448819,
"calib/std_conf": 0.16156106524765057,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4707071960297767,
"calib/step_q_c_n": 806.0,
"calib/step_q_gap": 0.04542390991079648,
"calib/step_q_w": 0.4252832861189802,
"calib/step_q_w_n": 706.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2501.0,
"completions/max_terminated_length": 2501.0,
"completions/mean_length": 483.0546875,
"completions/mean_terminated_length": 484.94903564453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.03626666666666667,
"grad_norm": 0.0237668976187706,
"kl": 0.049846649169921875,
"learning_rate": 4.611111111111112e-06,
"loss": -0.0303,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03377118334174156,
"mask/share_reasoning": 0.8307417631149292,
"mask/share_step_conf": 0.13158085942268372,
"num_tokens": 8115792.0,
"reward": 1.1046805381774902,
"reward_std": 0.21319980919361115,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6436253786087036,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.832886278629303,
"step": 34
},
{
"adv/mean_abs_final_conf": 0.768444299697876,
"adv/mean_abs_reasoning": 0.6530827283859253,
"adv/mean_abs_step_conf": 0.7854447364807129,
"adv/ratio_final_to_reasoning": 1.1766415896452558,
"adv/ratio_step_to_reasoning": 1.2026726513223163,
"adv/std_final_conf": 0.9365459084510803,
"adv/std_reasoning": 0.8746518492698669,
"adv/std_step_conf": 0.9357449412345886,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5909457364341085,
"calib/avg_num_step_conf": 5.0390625,
"calib/ece": 0.35590551181102364,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.562992125984252,
"calib/gap": 0.08058790697674423,
"calib/mean_conf": 0.8480314960629921,
"calib/mu_c": 0.8889600000000001,
"calib/mu_w": 0.8083720930232559,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.35590551181102364,
"calib/std_conf": 0.18002641700323216,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4165258215962442,
"calib/step_q_c_n": 639.0,
"calib/step_q_gap": -0.02817463923324892,
"calib/step_q_w": 0.4447004608294931,
"calib/step_q_w_n": 651.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2127.0,
"completions/max_terminated_length": 2127.0,
"completions/mean_length": 550.47265625,
"completions/mean_terminated_length": 550.47265625,
"completions/min_length": 188.0,
"completions/min_terminated_length": 188.0,
"epoch": 0.037333333333333336,
"grad_norm": 0.031982872635126114,
"kl": 0.043140411376953125,
"learning_rate": 4.583333333333333e-06,
"loss": -0.0319,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.029373832046985626,
"mask/share_reasoning": 0.8712924718856812,
"mask/share_step_conf": 0.09933367371559143,
"num_tokens": 8365969.0,
"reward": 1.0584797859191895,
"reward_std": 0.2510361969470978,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.6224359273910522,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.7994740009307861,
"step": 35
},
{
"adv/mean_abs_final_conf": 0.7417552471160889,
"adv/mean_abs_reasoning": 0.40401333570480347,
"adv/mean_abs_step_conf": 0.7431538105010986,
"adv/ratio_final_to_reasoning": 1.8359672356411028,
"adv/ratio_step_to_reasoning": 1.8394289119309954,
"adv/std_final_conf": 0.9318743348121643,
"adv/std_reasoning": 0.7204494476318359,
"adv/std_step_conf": 0.9359287619590759,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6210376794258373,
"calib/avg_num_step_conf": 5.359375,
"calib/ece": 0.18619047619047624,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5515873015873016,
"calib/gap": 0.058914473684210655,
"calib/mean_conf": 0.8628571428571429,
"calib/mu_c": 0.8806250000000001,
"calib/mu_w": 0.8217105263157894,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.17531746031746037,
"calib/std_conf": 0.1513581747120194,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4776652452025586,
"calib/step_q_c_n": 938.0,
"calib/step_q_gap": 0.03552238805970148,
"calib/step_q_w": 0.4421428571428571,
"calib/step_q_w_n": 434.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2142.0,
"completions/max_terminated_length": 2142.0,
"completions/mean_length": 500.69140625,
"completions/mean_terminated_length": 502.6549377441406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.0384,
"grad_norm": 0.04614810645580292,
"kl": 0.048313140869140625,
"learning_rate": 4.555555555555556e-06,
"loss": -0.0188,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03415819630026817,
"mask/share_reasoning": 0.8374646902084351,
"mask/share_step_conf": 0.12447094917297363,
"num_tokens": 8596858.0,
"reward": 1.1519205570220947,
"reward_std": 0.19022798538208008,
"rewards/accuracy_reward_step": 0.6875,
"rewards/final_brier_reward_step": 0.7522937059402466,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8114482760429382,
"step": 36
},
{
"adv/mean_abs_final_conf": 0.7809498310089111,
"adv/mean_abs_reasoning": 0.45968902111053467,
"adv/mean_abs_step_conf": 0.7330397367477417,
"adv/ratio_final_to_reasoning": 1.6988655267908337,
"adv/ratio_step_to_reasoning": 1.5946426890440752,
"adv/std_final_conf": 0.9347269535064697,
"adv/std_reasoning": 0.7394886016845703,
"adv/std_step_conf": 0.9361408352851868,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.618536280233528,
"calib/avg_num_step_conf": 5.35546875,
"calib/ece": 0.3527800829875518,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.4107883817427386,
"calib/gap": 0.08926675006950224,
"calib/mean_conf": 0.792116182572614,
"calib/mu_c": 0.8410091743119265,
"calib/mu_w": 0.7517424242424242,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.34630705394190864,
"calib/std_conf": 0.222647412558595,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.46208413001912046,
"calib/step_q_c_n": 523.0,
"calib/step_q_gap": 0.0653860168115733,
"calib/step_q_w": 0.39669811320754716,
"calib/step_q_w_n": 848.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3026.0,
"completions/max_terminated_length": 3026.0,
"completions/mean_length": 592.18359375,
"completions/mean_terminated_length": 594.5059204101562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 185.0,
"epoch": 0.039466666666666664,
"grad_norm": 0.03609446808695793,
"kl": 0.041744232177734375,
"learning_rate": 4.527777777777778e-06,
"loss": 0.0311,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.03040693700313568,
"mask/share_reasoning": 0.854725182056427,
"mask/share_step_conf": 0.11096163839101791,
"num_tokens": 8855553.0,
"reward": 1.0080500841140747,
"reward_std": 0.2737266421318054,
"rewards/accuracy_reward_step": 0.42578125,
"rewards/final_brier_reward_step": 0.5899812579154968,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l2_reward": 0.7694959044456482,
"step": 37
},
{
"adv/mean_abs_final_conf": 0.7715527415275574,
"adv/mean_abs_reasoning": 0.3854137659072876,
"adv/mean_abs_step_conf": 0.7639725804328918,
"adv/ratio_final_to_reasoning": 2.0018816393630234,
"adv/ratio_step_to_reasoning": 1.9822140463365485,
"adv/std_final_conf": 0.9348305463790894,
"adv/std_reasoning": 0.7012956142425537,
"adv/std_step_conf": 0.9358208179473877,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5929146537842189,
"calib/avg_num_step_conf": 5.22265625,
"calib/ece": 0.27399999999999997,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.432,
"calib/gap": 0.07270853462157834,
"calib/mean_conf": 0.80048,
"calib/mu_c": 0.833925925925926,
"calib/mu_w": 0.7612173913043476,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.26724,
"calib/std_conf": 0.20289447897860602,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4582369942196532,
"calib/step_q_c_n": 692.0,
"calib/step_q_gap": 0.024215908948335363,
"calib/step_q_w": 0.43402108527131783,
"calib/step_q_w_n": 645.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2579.0,
"completions/max_terminated_length": 2579.0,
"completions/mean_length": 525.8125,
"completions/mean_terminated_length": 527.87451171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.04053333333333333,
"grad_norm": 0.028932897374033928,
"kl": 0.04598236083984375,
"learning_rate": 4.5e-06,
"loss": -0.01,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.031843964010477066,
"mask/share_reasoning": 0.8506335020065308,
"mask/share_step_conf": 0.1136162132024765,
"num_tokens": 9097049.0,
"reward": 1.0886796712875366,
"reward_std": 0.19496215879917145,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.6627984642982483,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8091864585876465,
"step": 38
},
{
"adv/mean_abs_final_conf": 0.7793285846710205,
"adv/mean_abs_reasoning": 0.46733757853507996,
"adv/mean_abs_step_conf": 0.7500290870666504,
"adv/ratio_final_to_reasoning": 1.6675923796111367,
"adv/ratio_step_to_reasoning": 1.604897875787557,
"adv/std_final_conf": 0.9338850378990173,
"adv/std_reasoning": 0.7205843329429626,
"adv/std_step_conf": 0.9357541799545288,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6334584115071921,
"calib/avg_num_step_conf": 5.57421875,
"calib/ece": 0.29735177865612655,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.4268774703557312,
"calib/gap": 0.10142464040025001,
"calib/mean_conf": 0.7642292490118577,
"calib/mu_c": 0.8135384615384614,
"calib/mu_w": 0.7121138211382114,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2738735177865613,
"calib/std_conf": 0.2433444077939215,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4634579439252336,
"calib/step_q_c_n": 642.0,
"calib/step_q_gap": 0.0573815108042145,
"calib/step_q_w": 0.4060764331210191,
"calib/step_q_w_n": 785.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2701.0,
"completions/max_terminated_length": 2701.0,
"completions/mean_length": 513.671875,
"completions/mean_terminated_length": 513.671875,
"completions/min_length": 144.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.0416,
"grad_norm": 0.027603862807154655,
"kl": 0.04595947265625,
"learning_rate": 4.472222222222223e-06,
"loss": 0.0346,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03283419460058212,
"mask/share_reasoning": 0.8497879505157471,
"mask/share_step_conf": 0.1173778772354126,
"num_tokens": 9334637.0,
"reward": 1.10282564163208,
"reward_std": 0.19559051096439362,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.6709941625595093,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8231047987937927,
"step": 39
},
{
"adv/mean_abs_final_conf": 0.7777461409568787,
"adv/mean_abs_reasoning": 0.5213450193405151,
"adv/mean_abs_step_conf": 0.7524310350418091,
"adv/ratio_final_to_reasoning": 1.491806984059621,
"adv/ratio_step_to_reasoning": 1.4432496851962073,
"adv/std_final_conf": 0.9362614154815674,
"adv/std_reasoning": 0.7753744125366211,
"adv/std_step_conf": 0.9359205961227417,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6162286217303824,
"calib/avg_num_step_conf": 4.71484375,
"calib/ece": 0.31303149606299213,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.24803149606299213,
"calib/gap": 0.09046403420523152,
"calib/mean_conf": 0.7386220472440945,
"calib/mu_c": 0.7891964285714287,
"calib/mu_w": 0.6987323943661972,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.30535433070866147,
"calib/std_conf": 0.22126175146989444,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.46174757281553397,
"calib/step_q_c_n": 515.0,
"calib/step_q_gap": 0.038727341601661136,
"calib/step_q_w": 0.42302023121387283,
"calib/step_q_w_n": 692.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1747.0,
"completions/max_terminated_length": 1747.0,
"completions/mean_length": 501.30859375,
"completions/mean_terminated_length": 503.2745361328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.042666666666666665,
"grad_norm": 0.037154071033000946,
"kl": 0.05255126953125,
"learning_rate": 4.444444444444444e-06,
"loss": -0.0292,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.033608511090278625,
"mask/share_reasoning": 0.8529207706451416,
"mask/share_step_conf": 0.10956442356109619,
"num_tokens": 9569732.0,
"reward": 1.0853300094604492,
"reward_std": 0.20308294892311096,
"rewards/accuracy_reward_step": 0.4375,
"rewards/final_brier_reward_step": 0.6553597450256348,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8195751905441284,
"step": 40
},
{
"adv/mean_abs_final_conf": 0.7587120532989502,
"adv/mean_abs_reasoning": 0.5101221799850464,
"adv/mean_abs_step_conf": 0.7564547061920166,
"adv/ratio_final_to_reasoning": 1.4873143789223024,
"adv/ratio_step_to_reasoning": 1.4828892682419557,
"adv/std_final_conf": 0.9357188940048218,
"adv/std_reasoning": 0.7575936317443848,
"adv/std_step_conf": 0.9356542229652405,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6559618441971382,
"calib/avg_num_step_conf": 4.875,
"calib/ece": 0.12010988142292489,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.30039525691699603,
"calib/gap": 0.14302249602543704,
"calib/mean_conf": 0.7359375494071146,
"calib/mu_c": 0.7743783783783783,
"calib/mu_w": 0.6313558823529413,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06241106719367588,
"calib/std_conf": 0.2340747275470123,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.464314381270903,
"calib/step_q_c_n": 897.0,
"calib/step_q_gap": 0.040867087823609594,
"calib/step_q_w": 0.4234472934472934,
"calib/step_q_w_n": 351.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3062.0,
"completions/max_terminated_length": 3062.0,
"completions/mean_length": 479.34375,
"completions/mean_terminated_length": 479.34375,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.04373333333333333,
"grad_norm": 0.04717167466878891,
"kl": 0.10785293579101562,
"learning_rate": 4.416666666666667e-06,
"loss": -0.0067,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03561156243085861,
"mask/share_reasoning": 0.8464666604995728,
"mask/share_step_conf": 0.11792174726724625,
"num_tokens": 9799692.0,
"reward": 1.181377649307251,
"reward_std": 0.18686681985855103,
"rewards/accuracy_reward_step": 0.72265625,
"rewards/final_brier_reward_step": 0.7954376935958862,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8167534470558167,
"step": 41
},
{
"adv/mean_abs_final_conf": 0.7862992286682129,
"adv/mean_abs_reasoning": 0.42737993597984314,
"adv/mean_abs_step_conf": 0.7851310968399048,
"adv/ratio_final_to_reasoning": 1.8398131556304453,
"adv/ratio_step_to_reasoning": 1.8370799158829358,
"adv/std_final_conf": 0.9355712532997131,
"adv/std_reasoning": 0.6816651225090027,
"adv/std_step_conf": 0.9355069994926453,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.597852564102564,
"calib/avg_num_step_conf": 5.4296875,
"calib/ece": 0.22527559055118113,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.31496062992125984,
"calib/gap": 0.07996923076923079,
"calib/mean_conf": 0.7464566929133858,
"calib/mu_c": 0.7792,
"calib/mu_w": 0.6992307692307692,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19059055118110238,
"calib/std_conf": 0.2410493057644755,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.46577608142493643,
"calib/step_q_c_n": 786.0,
"calib/step_q_gap": 0.02478270394149268,
"calib/step_q_w": 0.44099337748344375,
"calib/step_q_w_n": 604.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2698.0,
"completions/max_terminated_length": 2698.0,
"completions/mean_length": 441.1875,
"completions/mean_terminated_length": 442.91766357421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.0448,
"grad_norm": 0.030178029090166092,
"kl": 0.058757781982421875,
"learning_rate": 4.388888888888889e-06,
"loss": 0.0395,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03646181896328926,
"mask/share_reasoning": 0.8297966718673706,
"mask/share_step_conf": 0.12983526289463043,
"num_tokens": 10017004.0,
"reward": 1.1211426258087158,
"reward_std": 0.19580897688865662,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.7046023607254028,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8157469630241394,
"step": 42
},
{
"adv/mean_abs_final_conf": 0.7810938954353333,
"adv/mean_abs_reasoning": 0.5661770105361938,
"adv/mean_abs_step_conf": 0.7464275360107422,
"adv/ratio_final_to_reasoning": 1.3795930970344485,
"adv/ratio_step_to_reasoning": 1.318364260858708,
"adv/std_final_conf": 0.9360543489456177,
"adv/std_reasoning": 0.7928243279457092,
"adv/std_step_conf": 0.9359555244445801,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.626399749765405,
"calib/avg_num_step_conf": 4.9453125,
"calib/ece": 0.18007874015748032,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.24803149606299213,
"calib/gap": 0.11477760400375348,
"calib/mean_conf": 0.6747244094488188,
"calib/mu_c": 0.7266906474820143,
"calib/mu_w": 0.6119130434782608,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.15377952755905513,
"calib/std_conf": 0.26266150418767414,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.44504098360655736,
"calib/step_q_c_n": 732.0,
"calib/step_q_gap": -0.009753023884079393,
"calib/step_q_w": 0.45479400749063675,
"calib/step_q_w_n": 534.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2058.0,
"completions/max_terminated_length": 2058.0,
"completions/mean_length": 506.80078125,
"completions/mean_terminated_length": 506.80078125,
"completions/min_length": 137.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.04586666666666667,
"grad_norm": 0.032175462692976,
"kl": 0.052890777587890625,
"learning_rate": 4.361111111111112e-06,
"loss": 0.0282,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.033847928047180176,
"mask/share_reasoning": 0.8536717891693115,
"mask/share_step_conf": 0.11248025298118591,
"num_tokens": 10251969.0,
"reward": 1.1071585416793823,
"reward_std": 0.2217944860458374,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.711056649684906,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.798527717590332,
"step": 43
},
{
"adv/mean_abs_final_conf": 0.7781606912612915,
"adv/mean_abs_reasoning": 0.44944122433662415,
"adv/mean_abs_step_conf": 0.7437357306480408,
"adv/ratio_final_to_reasoning": 1.7313958958924112,
"adv/ratio_step_to_reasoning": 1.6548008735642703,
"adv/std_final_conf": 0.9344425201416016,
"adv/std_reasoning": 0.7206430435180664,
"adv/std_step_conf": 0.9358530044555664,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6591481854838711,
"calib/avg_num_step_conf": 5.36328125,
"calib/ece": 0.23027777777777772,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.23412698412698413,
"calib/gap": 0.11502016129032266,
"calib/mean_conf": 0.6952777777777778,
"calib/mu_c": 0.7518750000000001,
"calib/mu_w": 0.6368548387096774,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20880952380952378,
"calib/std_conf": 0.23684668334375034,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.47359237536656895,
"calib/step_q_c_n": 682.0,
"calib/step_q_gap": 0.05480800488900023,
"calib/step_q_w": 0.4187843704775687,
"calib/step_q_w_n": 691.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1866.0,
"completions/max_terminated_length": 1866.0,
"completions/mean_length": 530.0,
"completions/mean_terminated_length": 532.0784912109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.046933333333333334,
"grad_norm": 0.031568072736263275,
"kl": 0.045703887939453125,
"learning_rate": 4.333333333333334e-06,
"loss": -0.0645,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03036835603415966,
"mask/share_reasoning": 0.8552473783493042,
"mask/share_step_conf": 0.11047796905040741,
"num_tokens": 10493969.0,
"reward": 1.1189405918121338,
"reward_std": 0.19654977321624756,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.7051723003387451,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8238892555236816,
"step": 44
},
{
"adv/mean_abs_final_conf": 0.7513673305511475,
"adv/mean_abs_reasoning": 0.46464505791664124,
"adv/mean_abs_step_conf": 0.7417216300964355,
"adv/ratio_final_to_reasoning": 1.6170780636731643,
"adv/ratio_step_to_reasoning": 1.5963187759321928,
"adv/std_final_conf": 0.9353601336479187,
"adv/std_reasoning": 0.75745689868927,
"adv/std_step_conf": 0.9355478882789612,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.670344387755102,
"calib/avg_num_step_conf": 5.6484375,
"calib/ece": 0.17829365079365092,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.2857142857142857,
"calib/gap": 0.15907142857142853,
"calib/mean_conf": 0.7176587301587302,
"calib/mu_c": 0.7883571428571429,
"calib/mu_w": 0.6292857142857143,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.17019841269841282,
"calib/std_conf": 0.2480296853296732,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4704521963824289,
"calib/step_q_c_n": 774.0,
"calib/step_q_gap": 0.04230933923957181,
"calib/step_q_w": 0.4281428571428571,
"calib/step_q_w_n": 672.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2734.0,
"completions/max_terminated_length": 2734.0,
"completions/mean_length": 506.3046875,
"completions/mean_terminated_length": 508.29022216796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 96.0,
"epoch": 0.048,
"grad_norm": 0.02938169613480568,
"kl": 0.047039031982421875,
"learning_rate": 4.305555555555556e-06,
"loss": 0.0387,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03402433916926384,
"mask/share_reasoning": 0.8358112573623657,
"mask/share_step_conf": 0.12625813484191895,
"num_tokens": 10728631.0,
"reward": 1.1400502920150757,
"reward_std": 0.18121424317359924,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.7322214841842651,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8277525901794434,
"step": 45
},
{
"adv/mean_abs_final_conf": 0.7710835337638855,
"adv/mean_abs_reasoning": 0.4401756227016449,
"adv/mean_abs_step_conf": 0.7695859670639038,
"adv/ratio_final_to_reasoning": 1.7517633735172404,
"adv/ratio_step_to_reasoning": 1.748361170799175,
"adv/std_final_conf": 0.9323577284812927,
"adv/std_reasoning": 0.7014464735984802,
"adv/std_step_conf": 0.9356615543365479,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6382832080200502,
"calib/avg_num_step_conf": 5.22265625,
"calib/ece": 0.2381818181818182,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.31620553359683795,
"calib/gap": 0.12006453634085212,
"calib/mean_conf": 0.7455335968379447,
"calib/mu_c": 0.8024812030075188,
"calib/mu_w": 0.6824166666666667,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2290118577075099,
"calib/std_conf": 0.23834784561973427,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4649563953488372,
"calib/step_q_c_n": 688.0,
"calib/step_q_gap": 0.039651310603074474,
"calib/step_q_w": 0.42530508474576273,
"calib/step_q_w_n": 649.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2371.0,
"completions/max_terminated_length": 2371.0,
"completions/mean_length": 530.8359375,
"completions/mean_terminated_length": 532.9176635742188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.04906666666666667,
"grad_norm": 0.028869692236185074,
"kl": 0.04286956787109375,
"learning_rate": 4.277777777777778e-06,
"loss": -0.04,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.034582871943712234,
"mask/share_reasoning": 0.8415893316268921,
"mask/share_step_conf": 0.1199214980006218,
"num_tokens": 10969293.0,
"reward": 1.102731466293335,
"reward_std": 0.1884532868862152,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.691124677658081,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8095587491989136,
"step": 46
},
{
"adv/mean_abs_final_conf": 0.7545320391654968,
"adv/mean_abs_reasoning": 0.36301976442337036,
"adv/mean_abs_step_conf": 0.7577338218688965,
"adv/ratio_final_to_reasoning": 2.0784874905200117,
"adv/ratio_step_to_reasoning": 2.0873073483271627,
"adv/std_final_conf": 0.9296429753303528,
"adv/std_reasoning": 0.6611769199371338,
"adv/std_step_conf": 0.9353532791137695,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6598508230452675,
"calib/avg_num_step_conf": 5.78515625,
"calib/ece": 0.21817460317460327,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.39285714285714285,
"calib/gap": 0.14166666666666683,
"calib/mean_conf": 0.7792857142857142,
"calib/mu_c": 0.84,
"calib/mu_w": 0.6983333333333331,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2130158730158731,
"calib/std_conf": 0.22678630796072777,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4585783718104495,
"calib/step_q_c_n": 823.0,
"calib/step_q_gap": 0.03772730798066237,
"calib/step_q_w": 0.42085106382978715,
"calib/step_q_w_n": 658.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2823.0,
"completions/max_terminated_length": 2823.0,
"completions/mean_length": 547.4375,
"completions/mean_terminated_length": 549.5843505859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.050133333333333335,
"grad_norm": 0.027648447081446648,
"kl": 0.041889190673828125,
"learning_rate": 4.25e-06,
"loss": -0.0455,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03150080889463425,
"mask/share_reasoning": 0.8493713140487671,
"mask/share_step_conf": 0.11522158980369568,
"num_tokens": 11215413.0,
"reward": 1.137692928314209,
"reward_std": 0.17622481286525726,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.7184492349624634,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8317077159881592,
"step": 47
},
{
"adv/mean_abs_final_conf": 0.767864465713501,
"adv/mean_abs_reasoning": 0.5330666303634644,
"adv/mean_abs_step_conf": 0.7646841406822205,
"adv/ratio_final_to_reasoning": 1.440466204365377,
"adv/ratio_step_to_reasoning": 1.43450011147918,
"adv/std_final_conf": 0.9259854555130005,
"adv/std_reasoning": 0.7753552198410034,
"adv/std_step_conf": 0.9356560111045837,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5637729756582215,
"calib/avg_num_step_conf": 5.4375,
"calib/ece": 0.36204724409448824,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.4330708661417323,
"calib/gap": 0.023260059612518647,
"calib/mean_conf": 0.7809448818897639,
"calib/mu_c": 0.7930327868852459,
"calib/mu_w": 0.7697727272727273,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3313385826771654,
"calib/std_conf": 0.2423098591228505,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.47289121061359873,
"calib/step_q_c_n": 603.0,
"calib/step_q_gap": 0.026414658015373127,
"calib/step_q_w": 0.4464765525982256,
"calib/step_q_w_n": 789.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3045.0,
"completions/max_terminated_length": 3045.0,
"completions/mean_length": 466.66015625,
"completions/mean_terminated_length": 468.490234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.0512,
"grad_norm": 0.03668780252337456,
"kl": 0.04692840576171875,
"learning_rate": 4.222222222222223e-06,
"loss": 0.0596,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03637230396270752,
"mask/share_reasoning": 0.8307284116744995,
"mask/share_step_conf": 0.12899301946163177,
"num_tokens": 11438566.0,
"reward": 1.063523292541504,
"reward_std": 0.19171887636184692,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.6065890789031982,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8183259963989258,
"step": 48
},
{
"adv/mean_abs_final_conf": 0.7678335309028625,
"adv/mean_abs_reasoning": 0.4053008556365967,
"adv/mean_abs_step_conf": 0.7596707344055176,
"adv/ratio_final_to_reasoning": 1.8944779420631723,
"adv/ratio_step_to_reasoning": 1.874337850119562,
"adv/std_final_conf": 0.9254300594329834,
"adv/std_reasoning": 0.6613235473632812,
"adv/std_step_conf": 0.935420572757721,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6970525082959204,
"calib/avg_num_step_conf": 5.22265625,
"calib/ece": 0.2572933333333333,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.492,
"calib/gap": 0.142554709697009,
"calib/mean_conf": 0.8173733333333332,
"calib/mu_c": 0.8795271867612292,
"calib/mu_w": 0.7369724770642202,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.2553333333333333,
"calib/std_conf": 0.1957939692636114,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.500190336749634,
"calib/step_q_c_n": 683.0,
"calib/step_q_gap": 0.08254507681079609,
"calib/step_q_w": 0.4176452599388379,
"calib/step_q_w_n": 654.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2547.0,
"completions/max_terminated_length": 2547.0,
"completions/mean_length": 487.578125,
"completions/mean_terminated_length": 489.490234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 186.0,
"epoch": 0.05226666666666667,
"grad_norm": 0.03198077902197838,
"kl": 0.044513702392578125,
"learning_rate": 4.194444444444445e-06,
"loss": -0.0769,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.0343097522854805,
"mask/share_reasoning": 0.8420140743255615,
"mask/share_step_conf": 0.11976996064186096,
"num_tokens": 11667922.0,
"reward": 1.1165080070495605,
"reward_std": 0.1887185424566269,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7010080218315125,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8182135224342346,
"step": 49
},
{
"adv/mean_abs_final_conf": 0.7431448101997375,
"adv/mean_abs_reasoning": 0.4307158887386322,
"adv/mean_abs_step_conf": 0.7490586042404175,
"adv/ratio_final_to_reasoning": 1.7253712473344442,
"adv/ratio_step_to_reasoning": 1.7391013979866496,
"adv/std_final_conf": 0.9296773672103882,
"adv/std_reasoning": 0.7013258934020996,
"adv/std_step_conf": 0.9353389143943787,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6683548387096774,
"calib/avg_num_step_conf": 5.4375,
"calib/ece": 0.2447054901960784,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.5058823529411764,
"calib/gap": 0.09710064516129058,
"calib/mean_conf": 0.8279219607843138,
"calib/mu_c": 0.8660006451612905,
"calib/mu_w": 0.7688999999999999,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.23239215686274506,
"calib/std_conf": 0.19343981033801264,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4770713391739675,
"calib/step_q_c_n": 799.0,
"calib/step_q_gap": 0.034895959747323235,
"calib/step_q_w": 0.44217537942664425,
"calib/step_q_w_n": 593.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1451.0,
"completions/max_terminated_length": 1451.0,
"completions/mean_length": 483.6875,
"completions/mean_terminated_length": 485.5843505859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 177.0,
"epoch": 0.05333333333333334,
"grad_norm": 0.03353298455476761,
"kl": 0.040256500244140625,
"learning_rate": 4.166666666666667e-06,
"loss": -0.0523,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.034353211522102356,
"mask/share_reasoning": 0.836700975894928,
"mask/share_step_conf": 0.12503957748413086,
"num_tokens": 11897106.0,
"reward": 1.1516687870025635,
"reward_std": 0.17538967728614807,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.7192476987838745,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8425183296203613,
"step": 50
},
{
"adv/mean_abs_final_conf": 0.7491422891616821,
"adv/mean_abs_reasoning": 0.4412249028682709,
"adv/mean_abs_step_conf": 0.7408877611160278,
"adv/ratio_final_to_reasoning": 1.6978694636040093,
"adv/ratio_step_to_reasoning": 1.6791612538180383,
"adv/std_final_conf": 0.9151452779769897,
"adv/std_reasoning": 0.7205055952072144,
"adv/std_step_conf": 0.93574458360672,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7797020123839009,
"calib/avg_num_step_conf": 4.98046875,
"calib/ece": 0.1741732283464567,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.43700787401574803,
"calib/gap": 0.2205959752321981,
"calib/mean_conf": 0.7725984251968504,
"calib/mu_c": 0.8611842105263157,
"calib/mu_w": 0.6405882352941176,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1741732283464567,
"calib/std_conf": 0.23705447262105386,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4693412384716732,
"calib/step_q_c_n": 759.0,
"calib/step_q_gap": 0.04350790513833991,
"calib/step_q_w": 0.4258333333333333,
"calib/step_q_w_n": 516.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2990.0,
"completions/max_terminated_length": 2990.0,
"completions/mean_length": 506.9375,
"completions/mean_terminated_length": 506.9375,
"completions/min_length": 189.0,
"completions/min_terminated_length": 189.0,
"epoch": 0.0544,
"grad_norm": 0.032079264521598816,
"kl": 0.040374755859375,
"learning_rate": 4.138888888888889e-06,
"loss": 0.0071,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03356383740901947,
"mask/share_reasoning": 0.8532578945159912,
"mask/share_step_conf": 0.11317827552556992,
"num_tokens": 12136178.0,
"reward": 1.1587438583374023,
"reward_std": 0.19736525416374207,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7653058767318726,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8243710994720459,
"step": 51
},
{
"adv/mean_abs_final_conf": 0.7362066507339478,
"adv/mean_abs_reasoning": 0.455300509929657,
"adv/mean_abs_step_conf": 0.7593135833740234,
"adv/ratio_final_to_reasoning": 1.6169686496676452,
"adv/ratio_step_to_reasoning": 1.6677195979669248,
"adv/std_final_conf": 0.9249099493026733,
"adv/std_reasoning": 0.7391347885131836,
"adv/std_step_conf": 0.9352485537528992,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.779368100494861,
"calib/avg_num_step_conf": 5.06640625,
"calib/ece": 0.06097656249999997,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.44921875,
"calib/gap": 0.28866920441568344,
"calib/mean_conf": 0.7441015625,
"calib/mu_c": 0.8241621621621622,
"calib/mu_w": 0.5354929577464788,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.041210937499999975,
"calib/std_conf": 0.2688836394428984,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.48005662514156283,
"calib/step_q_c_n": 883.0,
"calib/step_q_gap": 0.10493585219470292,
"calib/step_q_w": 0.3751207729468599,
"calib/step_q_w_n": 414.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1410.0,
"completions/max_terminated_length": 1410.0,
"completions/mean_length": 460.40625,
"completions/mean_terminated_length": 462.2117919921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.055466666666666664,
"grad_norm": 0.06597699224948883,
"kl": 0.044063568115234375,
"learning_rate": 4.111111111111111e-06,
"loss": -0.021,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03548233211040497,
"mask/share_reasoning": 0.8447404503822327,
"mask/share_step_conf": 0.11587096750736237,
"num_tokens": 12361994.0,
"reward": 1.2292792797088623,
"reward_std": 0.13924862444400787,
"rewards/accuracy_reward_step": 0.72265625,
"rewards/final_brier_reward_step": 0.8377586007118225,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8518874645233154,
"step": 52
},
{
"adv/mean_abs_final_conf": 0.7574901580810547,
"adv/mean_abs_reasoning": 0.44077375531196594,
"adv/mean_abs_step_conf": 0.7509580850601196,
"adv/ratio_final_to_reasoning": 1.718546417413003,
"adv/ratio_step_to_reasoning": 1.703726857622489,
"adv/std_final_conf": 0.9158998131752014,
"adv/std_reasoning": 0.7014303803443909,
"adv/std_step_conf": 0.9353622198104858,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6886863136863137,
"calib/avg_num_step_conf": 5.62890625,
"calib/ece": 0.240156862745098,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5254901960784314,
"calib/gap": 0.1623033216783215,
"calib/mean_conf": 0.8003921568627452,
"calib/mu_c": 0.8716783216783215,
"calib/mu_w": 0.709375,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.23988235294117646,
"calib/std_conf": 0.22842822517397507,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4806020942408376,
"calib/step_q_c_n": 764.0,
"calib/step_q_gap": 0.07710135568840037,
"calib/step_q_w": 0.40350073855243723,
"calib/step_q_w_n": 677.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2379.0,
"completions/max_terminated_length": 2379.0,
"completions/mean_length": 504.078125,
"completions/mean_terminated_length": 506.054931640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.05653333333333333,
"grad_norm": 0.031964704394340515,
"kl": 0.038970947265625,
"learning_rate": 4.083333333333334e-06,
"loss": -0.1266,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03250448405742645,
"mask/share_reasoning": 0.8465020060539246,
"mask/share_step_conf": 0.11708725988864899,
"num_tokens": 12596862.0,
"reward": 1.1403965950012207,
"reward_std": 0.17160692811012268,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7205559015274048,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8333872556686401,
"step": 53
},
{
"adv/mean_abs_final_conf": 0.6885330677032471,
"adv/mean_abs_reasoning": 0.3714979887008667,
"adv/mean_abs_step_conf": 0.7581756114959717,
"adv/ratio_final_to_reasoning": 1.8533964883929954,
"adv/ratio_step_to_reasoning": 2.0408606090905677,
"adv/std_final_conf": 0.897620677947998,
"adv/std_reasoning": 0.6611714959144592,
"adv/std_step_conf": 0.935348391532898,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.752979414951246,
"calib/avg_num_step_conf": 5.24609375,
"calib/ece": 0.1518181818181819,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6719367588932806,
"calib/gap": 0.20716375174121626,
"calib/mean_conf": 0.848181818181818,
"calib/mu_c": 0.9063186813186812,
"calib/mu_w": 0.699154929577465,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14031620553359694,
"calib/std_conf": 0.22338364203121278,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5111764705882353,
"calib/step_q_c_n": 884.0,
"calib/step_q_gap": 0.13037037037037036,
"calib/step_q_w": 0.380806100217865,
"calib/step_q_w_n": 459.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2707.0,
"completions/max_terminated_length": 2707.0,
"completions/mean_length": 455.734375,
"completions/mean_terminated_length": 457.5216064453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.0576,
"grad_norm": 0.0508962981402874,
"kl": 0.044342041015625,
"learning_rate": 4.055555555555556e-06,
"loss": 0.0584,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.036514509469270706,
"mask/share_reasoning": 0.8346399068832397,
"mask/share_step_conf": 0.12493934482336044,
"num_tokens": 12819762.0,
"reward": 1.2022638320922852,
"reward_std": 0.15340590476989746,
"rewards/accuracy_reward_step": 0.7109375,
"rewards/final_brier_reward_step": 0.8057184219360352,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.839310348033905,
"step": 54
},
{
"adv/mean_abs_final_conf": 0.7237907648086548,
"adv/mean_abs_reasoning": 0.5115185379981995,
"adv/mean_abs_step_conf": 0.776884913444519,
"adv/ratio_final_to_reasoning": 1.414984425864938,
"adv/ratio_step_to_reasoning": 1.5187815411046817,
"adv/std_final_conf": 0.888651967048645,
"adv/std_reasoning": 0.775328516960144,
"adv/std_step_conf": 0.9353832006454468,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.8313030984507745,
"calib/avg_num_step_conf": 4.91015625,
"calib/ece": 0.23161417322834646,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.610236220472441,
"calib/gap": 0.2941929035482257,
"calib/mean_conf": 0.7703543307086614,
"calib/mu_c": 0.9047101449275362,
"calib/mu_w": 0.6105172413793105,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.22933070866141733,
"calib/std_conf": 0.3026605652489857,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5009267563527653,
"calib/step_q_c_n": 669.0,
"calib/step_q_gap": 0.06662403526432986,
"calib/step_q_w": 0.43430272108843543,
"calib/step_q_w_n": 588.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1981.0,
"completions/max_terminated_length": 1981.0,
"completions/mean_length": 474.203125,
"completions/mean_terminated_length": 476.0627746582031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.058666666666666666,
"grad_norm": 0.05889483913779259,
"kl": 0.050426483154296875,
"learning_rate": 4.027777777777779e-06,
"loss": -0.0184,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03481750190258026,
"mask/share_reasoning": 0.8439182043075562,
"mask/share_step_conf": 0.11735805869102478,
"num_tokens": 13048982.0,
"reward": 1.160832405090332,
"reward_std": 0.20851004123687744,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.7488183379173279,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8443976640701294,
"step": 55
},
{
"adv/mean_abs_final_conf": 0.7262025475502014,
"adv/mean_abs_reasoning": 0.4655173420906067,
"adv/mean_abs_step_conf": 0.7637791633605957,
"adv/ratio_final_to_reasoning": 1.5599903202077827,
"adv/ratio_step_to_reasoning": 1.64071044041134,
"adv/std_final_conf": 0.893416702747345,
"adv/std_reasoning": 0.73926842212677,
"adv/std_step_conf": 0.9353504180908203,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.697511811023622,
"calib/avg_num_step_conf": 5.6328125,
"calib/ece": 0.35551587301587306,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6428571428571429,
"calib/gap": 0.14661228346456678,
"calib/mean_conf": 0.8290079365079365,
"calib/mu_c": 0.9017322834645669,
"calib/mu_w": 0.7551200000000001,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3402777777777778,
"calib/std_conf": 0.2505791069221469,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4982884097035041,
"calib/step_q_c_n": 742.0,
"calib/step_q_gap": 0.03434555256064692,
"calib/step_q_w": 0.46394285714285716,
"calib/step_q_w_n": 700.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2361.0,
"completions/max_terminated_length": 2361.0,
"completions/mean_length": 489.61328125,
"completions/mean_terminated_length": 489.61328125,
"completions/min_length": 98.0,
"completions/min_terminated_length": 98.0,
"epoch": 0.05973333333333333,
"grad_norm": 0.032840728759765625,
"kl": 0.04604339599609375,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0056,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03335815668106079,
"mask/share_reasoning": 0.8387473821640015,
"mask/share_step_conf": 0.12789444625377655,
"num_tokens": 13281163.0,
"reward": 1.087411642074585,
"reward_std": 0.19331833720207214,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.6446441411972046,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8227236270904541,
"step": 56
},
{
"adv/mean_abs_final_conf": 0.6837377548217773,
"adv/mean_abs_reasoning": 0.43261250853538513,
"adv/mean_abs_step_conf": 0.7604891657829285,
"adv/ratio_final_to_reasoning": 1.5804854028298436,
"adv/ratio_step_to_reasoning": 1.7578991609779702,
"adv/std_final_conf": 0.8755529522895813,
"adv/std_reasoning": 0.7012878656387329,
"adv/std_step_conf": 0.9352318644523621,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7409663865546218,
"calib/avg_num_step_conf": 5.40625,
"calib/ece": 0.18948818897637798,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.7047244094488189,
"calib/gap": 0.27577871148459376,
"calib/mean_conf": 0.8283858267716536,
"calib/mu_c": 0.9195882352941176,
"calib/mu_w": 0.6438095238095238,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1742913385826772,
"calib/std_conf": 0.2828360184672855,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5188636363636364,
"calib/step_q_c_n": 924.0,
"calib/step_q_gap": 0.0707114624505929,
"calib/step_q_w": 0.4481521739130435,
"calib/step_q_w_n": 460.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1821.0,
"completions/max_terminated_length": 1821.0,
"completions/mean_length": 486.140625,
"completions/mean_terminated_length": 489.968505859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.0608,
"grad_norm": 0.03787647560238838,
"kl": 0.0426483154296875,
"learning_rate": 3.972222222222223e-06,
"loss": -0.0413,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03327067941427231,
"mask/share_reasoning": 0.8391324877738953,
"mask/share_step_conf": 0.11978430300951004,
"num_tokens": 13512407.0,
"reward": 1.1986336708068848,
"reward_std": 0.1828002631664276,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.7892199754714966,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8511983156204224,
"step": 57
},
{
"adv/mean_abs_final_conf": 0.7383530139923096,
"adv/mean_abs_reasoning": 0.5903604030609131,
"adv/mean_abs_step_conf": 0.7504562139511108,
"adv/ratio_final_to_reasoning": 1.2506818041387622,
"adv/ratio_step_to_reasoning": 1.2711831790549122,
"adv/std_final_conf": 0.9139520525932312,
"adv/std_reasoning": 0.8266242742538452,
"adv/std_step_conf": 0.9352567195892334,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.588135026737968,
"calib/avg_num_step_conf": 7.5,
"calib/ece": 0.29032520325203254,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.6138211382113821,
"calib/gap": 0.10848128342245988,
"calib/mean_conf": 0.775609756097561,
"calib/mu_c": 0.8241176470588235,
"calib/mu_w": 0.7156363636363636,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.25654471544715446,
"calib/std_conf": 0.30846761657250876,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4922186495176849,
"calib/step_q_c_n": 933.0,
"calib/step_q_gap": 0.11768977413774573,
"calib/step_q_w": 0.3745288753799392,
"calib/step_q_w_n": 987.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2666.0,
"completions/max_terminated_length": 2666.0,
"completions/mean_length": 593.53515625,
"completions/mean_terminated_length": 602.9563598632812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.06186666666666667,
"grad_norm": 0.030713409185409546,
"kl": 0.037036895751953125,
"learning_rate": 3.944444444444445e-06,
"loss": -0.0777,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.02800177037715912,
"mask/share_reasoning": 0.8364847302436829,
"mask/share_step_conf": 0.11988846957683563,
"num_tokens": 13770672.0,
"reward": 1.0557935237884521,
"reward_std": 0.2658042907714844,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.6286625266075134,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.791220486164093,
"step": 58
},
{
"adv/mean_abs_final_conf": 0.6902940273284912,
"adv/mean_abs_reasoning": 0.4767693281173706,
"adv/mean_abs_step_conf": 0.7880936861038208,
"adv/ratio_final_to_reasoning": 1.4478574577233612,
"adv/ratio_step_to_reasoning": 1.6529873874558656,
"adv/std_final_conf": 0.8746950626373291,
"adv/std_reasoning": 0.7206186056137085,
"adv/std_step_conf": 0.9348664879798889,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6117868059093224,
"calib/avg_num_step_conf": 5.41796875,
"calib/ece": 0.3316078431372549,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.7411764705882353,
"calib/gap": 0.08438104941416191,
"calib/mean_conf": 0.8422745098039216,
"calib/mu_c": 0.8766887417218543,
"calib/mu_w": 0.7923076923076924,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2908627450980392,
"calib/std_conf": 0.2866211599834351,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5637988165680473,
"calib/step_q_c_n": 845.0,
"calib/step_q_gap": 0.04055158409572257,
"calib/step_q_w": 0.5232472324723247,
"calib/step_q_w_n": 542.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2047.0,
"completions/max_terminated_length": 2047.0,
"completions/mean_length": 518.16015625,
"completions/mean_terminated_length": 520.1921997070312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.06293333333333333,
"grad_norm": 0.036398790776729584,
"kl": 0.04123687744140625,
"learning_rate": 3.916666666666667e-06,
"loss": 0.0292,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03273506462574005,
"mask/share_reasoning": 0.848429799079895,
"mask/share_step_conf": 0.11492891609668732,
"num_tokens": 14009569.0,
"reward": 1.1045868396759033,
"reward_std": 0.20788058638572693,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6519827842712402,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8266689777374268,
"step": 59
},
{
"adv/mean_abs_final_conf": 0.6342928409576416,
"adv/mean_abs_reasoning": 0.42958423495292664,
"adv/mean_abs_step_conf": 0.7749303579330444,
"adv/ratio_final_to_reasoning": 1.476527277652884,
"adv/ratio_step_to_reasoning": 1.803907813372063,
"adv/std_final_conf": 0.8228415846824646,
"adv/std_reasoning": 0.7013217806816101,
"adv/std_step_conf": 0.9355958104133606,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7317820156605204,
"calib/avg_num_step_conf": 5.2109375,
"calib/ece": 0.24665921568627455,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.7098039215686275,
"calib/gap": 0.30096544581965146,
"calib/mean_conf": 0.795536862745098,
"calib/mu_c": 0.9218243243243244,
"calib/mu_w": 0.6208588785046729,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.23090196078431377,
"calib/std_conf": 0.33709905516639943,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5604714673913044,
"calib/step_q_c_n": 736.0,
"calib/step_q_gap": 0.07986946070234119,
"calib/step_q_w": 0.4806020066889632,
"calib/step_q_w_n": 598.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2408.0,
"completions/max_terminated_length": 2408.0,
"completions/mean_length": 490.54296875,
"completions/mean_terminated_length": 490.54296875,
"completions/min_length": 190.0,
"completions/min_terminated_length": 190.0,
"epoch": 0.064,
"grad_norm": 0.035494614392519,
"kl": 0.043849945068359375,
"learning_rate": 3.88888888888889e-06,
"loss": 0.0345,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03389373794198036,
"mask/share_reasoning": 0.8445810675621033,
"mask/share_step_conf": 0.12152522802352905,
"num_tokens": 14244004.0,
"reward": 1.1639959812164307,
"reward_std": 0.20684689283370972,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.7402294874191284,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.848612368106842,
"step": 60
},
{
"adv/mean_abs_final_conf": 0.593455970287323,
"adv/mean_abs_reasoning": 0.38843584060668945,
"adv/mean_abs_step_conf": 0.7466306686401367,
"adv/ratio_final_to_reasoning": 1.5278095073832967,
"adv/ratio_step_to_reasoning": 1.9221466986001874,
"adv/std_final_conf": 0.8274534344673157,
"adv/std_reasoning": 0.6612657308578491,
"adv/std_step_conf": 0.9348787069320679,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6280560839850631,
"calib/avg_num_step_conf": 5.46875,
"calib/ece": 0.2711417322834646,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.8582677165354331,
"calib/gap": 0.12214683294581852,
"calib/mean_conf": 0.9068110236220471,
"calib/mu_c": 0.9467251461988305,
"calib/mu_w": 0.824578313253012,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2523622047244095,
"calib/std_conf": 0.240056726638651,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5864446952595938,
"calib/step_q_c_n": 886.0,
"calib/step_q_gap": 0.09193107658255101,
"calib/step_q_w": 0.49451361867704274,
"calib/step_q_w_n": 514.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2640.0,
"completions/max_terminated_length": 2640.0,
"completions/mean_length": 459.90234375,
"completions/mean_terminated_length": 459.90234375,
"completions/min_length": 152.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.06506666666666666,
"grad_norm": 0.03402427211403847,
"kl": 0.05214691162109375,
"learning_rate": 3.861111111111112e-06,
"loss": 0.047,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03737509623169899,
"mask/share_reasoning": 0.8354184031486511,
"mask/share_step_conf": 0.1272064745426178,
"num_tokens": 14465803.0,
"reward": 1.1424577236175537,
"reward_std": 0.19859057664871216,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.715925395488739,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8246393203735352,
"step": 61
},
{
"adv/mean_abs_final_conf": 0.6989138126373291,
"adv/mean_abs_reasoning": 0.5794708132743835,
"adv/mean_abs_step_conf": 0.7784146666526794,
"adv/ratio_final_to_reasoning": 1.206124271709244,
"adv/ratio_step_to_reasoning": 1.3433198856973225,
"adv/std_final_conf": 0.8719269037246704,
"adv/std_reasoning": 0.7929136157035828,
"adv/std_step_conf": 0.9353814125061035,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5977225939269172,
"calib/avg_num_step_conf": 5.80078125,
"calib/ece": 0.3476800000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.7,
"calib/gap": 0.12513381369016985,
"calib/mean_conf": 0.80552,
"calib/mu_c": 0.8635820895522388,
"calib/mu_w": 0.738448275862069,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3086000000000001,
"calib/std_conf": 0.3257335254467983,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.556652719665272,
"calib/step_q_c_n": 717.0,
"calib/step_q_gap": 0.06268136549860531,
"calib/step_q_w": 0.49397135416666665,
"calib/step_q_w_n": 768.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2586.0,
"completions/max_terminated_length": 2586.0,
"completions/mean_length": 528.84765625,
"completions/mean_terminated_length": 530.9215698242188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.06613333333333334,
"grad_norm": 0.04951612651348114,
"kl": 0.04522705078125,
"learning_rate": 3.833333333333334e-06,
"loss": 0.0221,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.032482948154211044,
"mask/share_reasoning": 0.8465949296951294,
"mask/share_step_conf": 0.11701588332653046,
"num_tokens": 14708268.0,
"reward": 1.0481505393981934,
"reward_std": 0.2715667188167572,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.6199171543121338,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7842558026313782,
"step": 62
},
{
"adv/mean_abs_final_conf": 0.6785953044891357,
"adv/mean_abs_reasoning": 0.4569165110588074,
"adv/mean_abs_step_conf": 0.7319062352180481,
"adv/ratio_final_to_reasoning": 1.485162580176047,
"adv/ratio_step_to_reasoning": 1.6018380108917714,
"adv/std_final_conf": 0.8627391457557678,
"adv/std_reasoning": 0.7206259369850159,
"adv/std_step_conf": 0.9355800747871399,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6900068681318682,
"calib/avg_num_step_conf": 5.28515625,
"calib/ece": 0.228207171314741,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6454183266932271,
"calib/gap": 0.28906662087912094,
"calib/mean_conf": 0.733386454183267,
"calib/mu_c": 0.8381875000000001,
"calib/mu_w": 0.5491208791208791,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.16207171314741034,
"calib/std_conf": 0.38128959993644995,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5746572104018913,
"calib/step_q_c_n": 846.0,
"calib/step_q_gap": 0.046964902709583756,
"calib/step_q_w": 0.5276923076923076,
"calib/step_q_w_n": 507.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2154.0,
"completions/max_terminated_length": 2154.0,
"completions/mean_length": 534.83984375,
"completions/mean_terminated_length": 536.9373168945312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.0672,
"grad_norm": 0.03342531993985176,
"kl": 0.044696807861328125,
"learning_rate": 3.8055555555555556e-06,
"loss": -0.0119,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03186768665909767,
"mask/share_reasoning": 0.8521930575370789,
"mask/share_step_conf": 0.11203300207853317,
"num_tokens": 14953827.0,
"reward": 1.1439241170883179,
"reward_std": 0.2590183615684509,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7329293489456177,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8230709433555603,
"step": 63
},
{
"adv/mean_abs_final_conf": 0.6654384732246399,
"adv/mean_abs_reasoning": 0.48860666155815125,
"adv/mean_abs_step_conf": 0.7677325010299683,
"adv/ratio_final_to_reasoning": 1.3619103577150946,
"adv/ratio_step_to_reasoning": 1.5712690010850312,
"adv/std_final_conf": 0.8467351198196411,
"adv/std_reasoning": 0.739334762096405,
"adv/std_step_conf": 0.9357771873474121,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6519786910197869,
"calib/avg_num_step_conf": 5.35546875,
"calib/ece": 0.23972332015810277,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.616600790513834,
"calib/gap": 0.1977564687975647,
"calib/mean_conf": 0.7331620553359685,
"calib/mu_c": 0.7902222222222223,
"calib/mu_w": 0.5924657534246576,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1307114624505929,
"calib/std_conf": 0.36325314486712246,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5402518711018711,
"calib/step_q_c_n": 962.0,
"calib/step_q_gap": 0.01497069750773905,
"calib/step_q_w": 0.5252811735941321,
"calib/step_q_w_n": 409.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1829.0,
"completions/max_terminated_length": 1829.0,
"completions/mean_length": 476.40625,
"completions/mean_terminated_length": 482.05535888671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 192.0,
"epoch": 0.06826666666666667,
"grad_norm": 0.07387295365333557,
"kl": 0.04605865478515625,
"learning_rate": 3.777777777777778e-06,
"loss": -0.0693,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03335306793451309,
"mask/share_reasoning": 0.8404361009597778,
"mask/share_step_conf": 0.11449208855628967,
"num_tokens": 15179563.0,
"reward": 1.1393107175827026,
"reward_std": 0.21538883447647095,
"rewards/accuracy_reward_step": 0.703125,
"rewards/final_brier_reward_step": 0.734772264957428,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8037118911743164,
"step": 64
},
{
"adv/mean_abs_final_conf": 0.5958322286605835,
"adv/mean_abs_reasoning": 0.33800065517425537,
"adv/mean_abs_step_conf": 0.7735382318496704,
"adv/ratio_final_to_reasoning": 1.7628138275454044,
"adv/ratio_step_to_reasoning": 2.2885702143117883,
"adv/std_final_conf": 0.8070009350776672,
"adv/std_reasoning": 0.6401605010032654,
"adv/std_step_conf": 0.9356332421302795,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6378621378621377,
"calib/avg_num_step_conf": 5.01171875,
"calib/ece": 0.36969934640522883,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.8823529411764706,
"calib/gap": 0.12563457375957354,
"calib/mean_conf": 0.9171503267973856,
"calib/mu_c": 0.9723310023310022,
"calib/mu_w": 0.8466964285714287,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3630326797385622,
"calib/std_conf": 0.23368243085687596,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6310841950399327,
"calib/step_q_c_n": 793.0,
"calib/step_q_gap": 0.029611746060340893,
"calib/step_q_w": 0.6014724489795918,
"calib/step_q_w_n": 490.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 968.0,
"completions/max_terminated_length": 968.0,
"completions/mean_length": 411.94921875,
"completions/mean_terminated_length": 413.5647277832031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.06933333333333333,
"grad_norm": 0.06507488340139389,
"kl": 0.05391693115234375,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.0078,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03861987590789795,
"mask/share_reasoning": 0.8285122513771057,
"mask/share_step_conf": 0.12896165251731873,
"num_tokens": 15390046.0,
"reward": 1.0620718002319336,
"reward_std": 0.21548622846603394,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.6281029582023621,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.7900686264038086,
"step": 65
},
{
"adv/mean_abs_final_conf": 0.6308272480964661,
"adv/mean_abs_reasoning": 0.4362999498844147,
"adv/mean_abs_step_conf": 0.7803380489349365,
"adv/ratio_final_to_reasoning": 1.4458567970580467,
"adv/ratio_step_to_reasoning": 1.7885357290131823,
"adv/std_final_conf": 0.8291295766830444,
"adv/std_reasoning": 0.7013883590698242,
"adv/std_step_conf": 0.9355806708335876,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7408925318761385,
"calib/avg_num_step_conf": 6.2578125,
"calib/ece": 0.27556451612903216,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6532258064516129,
"calib/gap": 0.30061670569867294,
"calib/mean_conf": 0.7483064516129032,
"calib/mu_c": 0.8961904761904762,
"calib/mu_w": 0.5955737704918033,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.2579032258064515,
"calib/std_conf": 0.3661376917407103,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5993078880407124,
"calib/step_q_c_n": 655.0,
"calib/step_q_gap": 0.11990978877988878,
"calib/step_q_w": 0.47939809926082366,
"calib/step_q_w_n": 947.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2804.0,
"completions/max_terminated_length": 2804.0,
"completions/mean_length": 573.8828125,
"completions/mean_terminated_length": 578.4015502929688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.0704,
"grad_norm": 0.04619403928518295,
"kl": 0.041259765625,
"learning_rate": 3.7222222222222225e-06,
"loss": -0.1041,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.031027261167764664,
"mask/share_reasoning": 0.8438991904258728,
"mask/share_step_conf": 0.11726106703281403,
"num_tokens": 15643312.0,
"reward": 1.090234637260437,
"reward_std": 0.222636416554451,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.6864187717437744,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8012419939041138,
"step": 66
},
{
"adv/mean_abs_final_conf": 0.5185285210609436,
"adv/mean_abs_reasoning": 0.3017883002758026,
"adv/mean_abs_step_conf": 0.7628713846206665,
"adv/ratio_final_to_reasoning": 1.7181862934615535,
"adv/ratio_step_to_reasoning": 2.5278361815997594,
"adv/std_final_conf": 0.7563133239746094,
"adv/std_reasoning": 0.5960524678230286,
"adv/std_step_conf": 0.9346969127655029,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7570650323459313,
"calib/avg_num_step_conf": 5.3984375,
"calib/ece": 0.22109448818897628,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.7047244094488189,
"calib/gap": 0.2940209737827715,
"calib/mean_conf": 0.8211102362204725,
"calib/mu_c": 0.9241333333333334,
"calib/mu_w": 0.6301123595505619,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1962992125984251,
"calib/std_conf": 0.3068948556344743,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6197363885624755,
"calib/step_q_c_n": 851.0,
"calib/step_q_gap": 0.07519778215946238,
"calib/step_q_w": 0.5445386064030131,
"calib/step_q_w_n": 531.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2935.0,
"completions/max_terminated_length": 2935.0,
"completions/mean_length": 509.9296875,
"completions/mean_terminated_length": 509.9296875,
"completions/min_length": 180.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.07146666666666666,
"grad_norm": 0.03376888856291771,
"kl": 0.0507659912109375,
"learning_rate": 3.694444444444445e-06,
"loss": 0.1016,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.033446695655584335,
"mask/share_reasoning": 0.8498560190200806,
"mask/share_step_conf": 0.1166972666978836,
"num_tokens": 15878862.0,
"reward": 1.1770553588867188,
"reward_std": 0.146753191947937,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7765185832977295,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8334989547729492,
"step": 67
},
{
"adv/mean_abs_final_conf": 0.5941345691680908,
"adv/mean_abs_reasoning": 0.3606022596359253,
"adv/mean_abs_step_conf": 0.7671284079551697,
"adv/ratio_final_to_reasoning": 1.6476174324807245,
"adv/ratio_step_to_reasoning": 2.127353302582422,
"adv/std_final_conf": 0.8287256360054016,
"adv/std_reasoning": 0.6611523628234863,
"adv/std_step_conf": 0.9351922273635864,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7173514538558786,
"calib/avg_num_step_conf": 5.4609375,
"calib/ece": 0.30529644268774697,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.758893280632411,
"calib/gap": 0.2553065739570166,
"calib/mean_conf": 0.8456126482213439,
"calib/mu_c": 0.9596428571428574,
"calib/mu_w": 0.7043362831858407,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.29877470355731217,
"calib/std_conf": 0.3038841445004291,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6220592592592592,
"calib/step_q_c_n": 675.0,
"calib/step_q_gap": 0.14553062855386495,
"calib/step_q_w": 0.47652863070539425,
"calib/step_q_w_n": 723.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2804.0,
"completions/max_terminated_length": 2804.0,
"completions/mean_length": 489.4921875,
"completions/mean_terminated_length": 489.4921875,
"completions/min_length": 125.0,
"completions/min_terminated_length": 125.0,
"epoch": 0.07253333333333334,
"grad_norm": 0.9474548101425171,
"kl": 1.2303543090820312,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.0606,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.036423999816179276,
"mask/share_reasoning": 0.8375931978225708,
"mask/share_step_conf": 0.12598282098770142,
"num_tokens": 16108260.0,
"reward": 1.1203961372375488,
"reward_std": 0.1978849619626999,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6930711269378662,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8276473879814148,
"step": 68
},
{
"adv/mean_abs_final_conf": 0.6566118001937866,
"adv/mean_abs_reasoning": 0.47426342964172363,
"adv/mean_abs_step_conf": 0.765344500541687,
"adv/ratio_final_to_reasoning": 1.384487521396739,
"adv/ratio_step_to_reasoning": 1.6137539871456186,
"adv/std_final_conf": 0.8505991697311401,
"adv/std_reasoning": 0.7206820249557495,
"adv/std_step_conf": 0.9354822039604187,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.70145585176782,
"calib/avg_num_step_conf": 5.05078125,
"calib/ece": 0.24475555555555553,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5158730158730159,
"calib/gap": 0.2759776139156741,
"calib/mean_conf": 0.6880222222222222,
"calib/mu_c": 0.8227255813953489,
"calib/mu_w": 0.5467479674796748,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.21043650793650792,
"calib/std_conf": 0.3722595145497822,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6086429725363489,
"calib/step_q_c_n": 619.0,
"calib/step_q_gap": 0.07013481229896013,
"calib/step_q_w": 0.5385081602373888,
"calib/step_q_w_n": 674.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2458.0,
"completions/max_terminated_length": 2458.0,
"completions/mean_length": 547.37890625,
"completions/mean_terminated_length": 553.8695678710938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 187.0,
"epoch": 0.0736,
"grad_norm": 0.056202232837677,
"kl": 0.037876129150390625,
"learning_rate": 3.638888888888889e-06,
"loss": -0.0615,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.030722718685865402,
"mask/share_reasoning": 0.8574061393737793,
"mask/share_step_conf": 0.1001524031162262,
"num_tokens": 16352885.0,
"reward": 1.1116247177124023,
"reward_std": 0.22466173768043518,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.707231879234314,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8122408390045166,
"step": 69
},
{
"adv/mean_abs_final_conf": 0.6317081451416016,
"adv/mean_abs_reasoning": 0.43293261528015137,
"adv/mean_abs_step_conf": 0.7767082452774048,
"adv/ratio_final_to_reasoning": 1.4591373411144417,
"adv/ratio_step_to_reasoning": 1.7940626736444785,
"adv/std_final_conf": 0.8324445486068726,
"adv/std_reasoning": 0.7013914585113525,
"adv/std_step_conf": 0.9354316592216492,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7892195506503745,
"calib/avg_num_step_conf": 5.28515625,
"calib/ece": 0.17151147098515523,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.5465587044534413,
"calib/gap": 0.42268690053869395,
"calib/mean_conf": 0.659851551956815,
"calib/mu_c": 0.861782945736434,
"calib/mu_w": 0.43909604519774004,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.1545479082321188,
"calib/std_conf": 0.39630890942845737,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5988285570638512,
"calib/step_q_c_n": 663.0,
"calib/step_q_gap": 0.11718604498655644,
"calib/step_q_w": 0.48164251207729475,
"calib/step_q_w_n": 690.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2998.0,
"completions/max_terminated_length": 2998.0,
"completions/mean_length": 557.1015625,
"completions/mean_terminated_length": 557.1015625,
"completions/min_length": 141.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.07466666666666667,
"grad_norm": 0.05277327448129654,
"kl": 0.04144287109375,
"learning_rate": 3.6111111111111115e-06,
"loss": 0.068,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.034348566085100174,
"mask/share_reasoning": 0.8481545448303223,
"mask/share_step_conf": 0.11749683320522308,
"num_tokens": 16602495.0,
"reward": 1.129831314086914,
"reward_std": 0.22639545798301697,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.7542862892150879,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8082718849182129,
"step": 70
},
{
"adv/mean_abs_final_conf": 0.6897363066673279,
"adv/mean_abs_reasoning": 0.5863041877746582,
"adv/mean_abs_step_conf": 0.7683776021003723,
"adv/ratio_final_to_reasoning": 1.176413747418811,
"adv/ratio_step_to_reasoning": 1.3105442842166646,
"adv/std_final_conf": 0.873703122138977,
"adv/std_reasoning": 0.7929514050483704,
"adv/std_step_conf": 0.9352450966835022,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6145792563600783,
"calib/avg_num_step_conf": 5.94921875,
"calib/ece": 0.2747410358565736,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.4701195219123506,
"calib/gap": 0.18351272015655573,
"calib/mean_conf": 0.6196015936254979,
"calib/mu_c": 0.6963698630136986,
"calib/mu_w": 0.5128571428571429,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.15633466135458163,
"calib/std_conf": 0.39579845246986506,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5789646464646465,
"calib/step_q_c_n": 792.0,
"calib/step_q_gap": 0.08642930674736432,
"calib/step_q_w": 0.4925353397172822,
"calib/step_q_w_n": 731.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2698.0,
"completions/max_terminated_length": 2698.0,
"completions/mean_length": 523.3671875,
"completions/mean_terminated_length": 527.4881591796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.07573333333333333,
"grad_norm": 0.04673464596271515,
"kl": 0.042041778564453125,
"learning_rate": 3.5833333333333335e-06,
"loss": -0.0022,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03401986509561539,
"mask/share_reasoning": 0.8340585231781006,
"mask/share_step_conf": 0.12410911917686462,
"num_tokens": 16840885.0,
"reward": 1.0984848737716675,
"reward_std": 0.22213947772979736,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.6739937663078308,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8090673685073853,
"step": 71
},
{
"adv/mean_abs_final_conf": 0.6500765681266785,
"adv/mean_abs_reasoning": 0.39389580488204956,
"adv/mean_abs_step_conf": 0.7558625936508179,
"adv/ratio_final_to_reasoning": 1.650376978047129,
"adv/ratio_step_to_reasoning": 1.9189404514657316,
"adv/std_final_conf": 0.8562666773796082,
"adv/std_reasoning": 0.6815637350082397,
"adv/std_step_conf": 0.9354879856109619,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.825126809352963,
"calib/avg_num_step_conf": 5.28125,
"calib/ece": 0.14211764705882357,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.4392156862745098,
"calib/gap": 0.4700569095632809,
"calib/mean_conf": 0.604235294117647,
"calib/mu_c": 0.8217518248175182,
"calib/mu_w": 0.3516949152542373,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.10454901960784319,
"calib/std_conf": 0.39829256112124584,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5813125845737483,
"calib/step_q_c_n": 739.0,
"calib/step_q_gap": 0.07480361230621158,
"calib/step_q_w": 0.5065089722675368,
"calib/step_q_w_n": 613.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1605.0,
"completions/max_terminated_length": 1605.0,
"completions/mean_length": 460.70703125,
"completions/mean_terminated_length": 462.5137634277344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 192.0,
"epoch": 0.0768,
"grad_norm": 0.052089888602495193,
"kl": 0.0478057861328125,
"learning_rate": 3.555555555555556e-06,
"loss": -0.0143,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.034116968512535095,
"mask/share_reasoning": 0.8442560434341431,
"mask/share_step_conf": 0.11772073805332184,
"num_tokens": 17063234.0,
"reward": 1.1859991550445557,
"reward_std": 0.17564892768859863,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.814927339553833,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8344014286994934,
"step": 72
},
{
"adv/mean_abs_final_conf": 0.7459884881973267,
"adv/mean_abs_reasoning": 0.5748550295829773,
"adv/mean_abs_step_conf": 0.7456186413764954,
"adv/ratio_final_to_reasoning": 1.2976984627558994,
"adv/ratio_step_to_reasoning": 1.2970550886845276,
"adv/std_final_conf": 0.913640558719635,
"adv/std_reasoning": 0.7929010391235352,
"adv/std_step_conf": 0.9357115030288696,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7170414758955022,
"calib/avg_num_step_conf": 4.7421875,
"calib/ece": 0.20872843915343914,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.3888888888888889,
"calib/gap": 0.3000431412155489,
"calib/mean_conf": 0.5766683862433862,
"calib/mu_c": 0.6885892405063291,
"calib/mu_w": 0.38854609929078016,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.07920634920634921,
"calib/std_conf": 0.3867900005611315,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6130971128608923,
"calib/step_q_c_n": 762.0,
"calib/step_q_gap": 0.11475640489629052,
"calib/step_q_w": 0.4983407079646018,
"calib/step_q_w_n": 452.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2013.0,
"completions/max_terminated_length": 2013.0,
"completions/mean_length": 467.76953125,
"completions/mean_terminated_length": 469.60394287109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.07786666666666667,
"grad_norm": 0.0889710932970047,
"kl": 0.0463409423828125,
"learning_rate": 3.5277777777777784e-06,
"loss": -0.0218,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.034170910716056824,
"mask/share_reasoning": 0.8531113266944885,
"mask/share_step_conf": 0.10881149023771286,
"num_tokens": 17290015.0,
"reward": 1.1512796878814697,
"reward_std": 0.221408873796463,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.7386392951011658,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8295924663543701,
"step": 73
},
{
"adv/mean_abs_final_conf": 0.7070341110229492,
"adv/mean_abs_reasoning": 0.44265496730804443,
"adv/mean_abs_step_conf": 0.7692466974258423,
"adv/ratio_final_to_reasoning": 1.59725782661538,
"adv/ratio_step_to_reasoning": 1.7378020224282766,
"adv/std_final_conf": 0.891591489315033,
"adv/std_reasoning": 0.7205852270126343,
"adv/std_step_conf": 0.9350951313972473,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7420948616600789,
"calib/avg_num_step_conf": 4.890625,
"calib/ece": 0.16842105263157894,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.24696356275303644,
"calib/gap": 0.34510408432147566,
"calib/mean_conf": 0.4353846153846154,
"calib/mu_c": 0.5960606060606061,
"calib/mu_w": 0.2509565217391304,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.034696356275303666,
"calib/std_conf": 0.3724333946459744,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5536338259441708,
"calib/step_q_c_n": 609.0,
"calib/step_q_gap": 0.07437566109191568,
"calib/step_q_w": 0.4792581648522551,
"calib/step_q_w_n": 643.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1949.0,
"completions/max_terminated_length": 1949.0,
"completions/mean_length": 486.6796875,
"completions/mean_terminated_length": 486.6796875,
"completions/min_length": 171.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.07893333333333333,
"grad_norm": 3.0365121364593506,
"kl": 11.238975524902344,
"learning_rate": 3.5e-06,
"loss": 0.2472,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.035113103687763214,
"mask/share_reasoning": 0.8513389825820923,
"mask/share_step_conf": 0.11354796588420868,
"num_tokens": 17518533.0,
"reward": 1.1350901126861572,
"reward_std": 0.1803215742111206,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.7471804618835449,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8179372549057007,
"step": 74
},
{
"adv/mean_abs_final_conf": 0.692913830280304,
"adv/mean_abs_reasoning": 0.3745307922363281,
"adv/mean_abs_step_conf": 0.7376118898391724,
"adv/ratio_final_to_reasoning": 1.8500850788339902,
"adv/ratio_step_to_reasoning": 1.9694292301972887,
"adv/std_final_conf": 0.871324360370636,
"adv/std_reasoning": 0.6613178849220276,
"adv/std_step_conf": 0.9350234866142273,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.841669758812616,
"calib/avg_num_step_conf": 5.140625,
"calib/ece": 0.11362817460317463,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.4880952380952381,
"calib/gap": 0.49556233766233765,
"calib/mean_conf": 0.6377210317460317,
"calib/mu_c": 0.7891428571428571,
"calib/mu_w": 0.2935805194805195,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.02845238095238099,
"calib/std_conf": 0.3893535529573639,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5927868852459016,
"calib/step_q_c_n": 915.0,
"calib/step_q_gap": 0.11303243803060653,
"calib/step_q_w": 0.4797544472152951,
"calib/step_q_w_n": 401.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2452.0,
"completions/max_terminated_length": 2452.0,
"completions/mean_length": 435.890625,
"completions/mean_terminated_length": 437.60003662109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.08,
"grad_norm": 0.08381512761116028,
"kl": 0.0576171875,
"learning_rate": 3.4722222222222224e-06,
"loss": -0.0208,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.036993805319070816,
"mask/share_reasoning": 0.8341166973114014,
"mask/share_step_conf": 0.1249832808971405,
"num_tokens": 17734873.0,
"reward": 1.2211979627609253,
"reward_std": 0.16637729108333588,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.8301264047622681,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8524503707885742,
"step": 75
},
{
"adv/mean_abs_final_conf": 0.6946251392364502,
"adv/mean_abs_reasoning": 0.46490082144737244,
"adv/mean_abs_step_conf": 0.7597237825393677,
"adv/ratio_final_to_reasoning": 1.4941361838722476,
"adv/ratio_step_to_reasoning": 1.6341631322012402,
"adv/std_final_conf": 0.9021607637405396,
"adv/std_reasoning": 0.7573848962783813,
"adv/std_step_conf": 0.9354786276817322,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.719203268641471,
"calib/avg_num_step_conf": 4.7109375,
"calib/ece": 0.23349081364829408,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.38976377952755903,
"calib/gap": 0.3165533991601407,
"calib/mean_conf": 0.5485564304461943,
"calib/mu_c": 0.6594747474747474,
"calib/mu_w": 0.34292134831460674,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.06622047244094494,
"calib/std_conf": 0.3989292515947965,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5832443257676903,
"calib/step_q_c_n": 749.0,
"calib/step_q_gap": 0.08530121854668371,
"calib/step_q_w": 0.49794310722100654,
"calib/step_q_w_n": 457.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2703.0,
"completions/max_terminated_length": 2703.0,
"completions/mean_length": 465.90625,
"completions/mean_terminated_length": 465.90625,
"completions/min_length": 197.0,
"completions/min_terminated_length": 197.0,
"epoch": 0.08106666666666666,
"grad_norm": 0.1563568115234375,
"kl": 0.28119659423828125,
"learning_rate": 3.444444444444445e-06,
"loss": 0.0588,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03585202246904373,
"mask/share_reasoning": 0.8539218306541443,
"mask/share_step_conf": 0.11022613197565079,
"num_tokens": 17957201.0,
"reward": 1.1630396842956543,
"reward_std": 0.18635006248950958,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7373994588851929,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8414114713668823,
"step": 76
},
{
"adv/mean_abs_final_conf": 0.7443857192993164,
"adv/mean_abs_reasoning": 0.45837777853012085,
"adv/mean_abs_step_conf": 0.7630868554115295,
"adv/ratio_final_to_reasoning": 1.6239568193867004,
"adv/ratio_step_to_reasoning": 1.66475534188965,
"adv/std_final_conf": 0.9162412285804749,
"adv/std_reasoning": 0.7015013694763184,
"adv/std_step_conf": 0.9357102513313293,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6684374999999999,
"calib/avg_num_step_conf": 4.80078125,
"calib/ece": 0.24876,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.34,
"calib/gap": 0.24712500000000004,
"calib/mean_conf": 0.5291600000000001,
"calib/mu_c": 0.618125,
"calib/mu_w": 0.371,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.06896000000000001,
"calib/std_conf": 0.3863020766188036,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.555174978127734,
"calib/step_q_c_n": 762.0,
"calib/step_q_gap": 0.06198439996927574,
"calib/step_q_w": 0.49319057815845824,
"calib/step_q_w_n": 467.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2508.0,
"completions/max_terminated_length": 2508.0,
"completions/mean_length": 453.71875,
"completions/mean_terminated_length": 453.71875,
"completions/min_length": 156.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.08213333333333334,
"grad_norm": 0.07061842828989029,
"kl": 0.0658111572265625,
"learning_rate": 3.416666666666667e-06,
"loss": -0.0496,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03902812302112579,
"mask/share_reasoning": 0.8365130424499512,
"mask/share_step_conf": 0.12445880472660065,
"num_tokens": 18178017.0,
"reward": 1.1348567008972168,
"reward_std": 0.20125140249729156,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7050395011901855,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8295742273330688,
"step": 77
},
{
"adv/mean_abs_final_conf": 0.70162034034729,
"adv/mean_abs_reasoning": 0.47509390115737915,
"adv/mean_abs_step_conf": 0.7606037855148315,
"adv/ratio_final_to_reasoning": 1.4768035090285698,
"adv/ratio_step_to_reasoning": 1.6009546400446732,
"adv/std_final_conf": 0.8772461414337158,
"adv/std_reasoning": 0.7392054200172424,
"adv/std_step_conf": 0.9353312849998474,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7230111851200202,
"calib/avg_num_step_conf": 5.41015625,
"calib/ece": 0.2087843137254901,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.4549019607843137,
"calib/gap": 0.3062146537639814,
"calib/mean_conf": 0.6443137254901962,
"calib/mu_c": 0.7752054794520548,
"calib/mu_w": 0.4689908256880734,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14027450980392148,
"calib/std_conf": 0.38189750653530774,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5682592592592592,
"calib/step_q_c_n": 810.0,
"calib/step_q_gap": 0.07429404186795496,
"calib/step_q_w": 0.4939652173913043,
"calib/step_q_w_n": 575.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1422.0,
"completions/max_terminated_length": 1422.0,
"completions/mean_length": 506.15234375,
"completions/mean_terminated_length": 508.1372985839844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 200.0,
"epoch": 0.0832,
"grad_norm": 0.05703939124941826,
"kl": 0.07027435302734375,
"learning_rate": 3.3888888888888893e-06,
"loss": 0.0122,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03177928924560547,
"mask/share_reasoning": 0.8523037433624268,
"mask/share_step_conf": 0.11201069504022598,
"num_tokens": 18415616.0,
"reward": 1.1688693761825562,
"reward_std": 0.17837125062942505,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7512054443359375,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8488346338272095,
"step": 78
},
{
"adv/mean_abs_final_conf": 0.6945815086364746,
"adv/mean_abs_reasoning": 0.4822331368923187,
"adv/mean_abs_step_conf": 0.7536656260490417,
"adv/ratio_final_to_reasoning": 1.4403437995003912,
"adv/ratio_step_to_reasoning": 1.562865693771959,
"adv/std_final_conf": 0.8733789324760437,
"adv/std_reasoning": 0.7392861247062683,
"adv/std_step_conf": 0.9356251955032349,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7021109976166156,
"calib/avg_num_step_conf": 5.32421875,
"calib/ece": 0.20346456692913395,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5905511811023622,
"calib/gap": 0.2385195778004766,
"calib/mean_conf": 0.7600000000000001,
"calib/mu_c": 0.8435757575757575,
"calib/mu_w": 0.6050561797752809,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15692913385826784,
"calib/std_conf": 0.33442158314856196,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.601807182320442,
"calib/step_q_c_n": 905.0,
"calib/step_q_gap": 0.058073557866293535,
"calib/step_q_w": 0.5437336244541484,
"calib/step_q_w_n": 458.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1659.0,
"completions/max_terminated_length": 1659.0,
"completions/mean_length": 497.23828125,
"completions/mean_terminated_length": 499.1882629394531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.08426666666666667,
"grad_norm": 0.040150903165340424,
"kl": 0.06087493896484375,
"learning_rate": 3.3611111111111117e-06,
"loss": -0.079,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.033050116151571274,
"mask/share_reasoning": 0.8556927442550659,
"mask/share_step_conf": 0.1073509156703949,
"num_tokens": 18649285.0,
"reward": 1.1646015644073486,
"reward_std": 0.22102072834968567,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.74712073802948,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8370131254196167,
"step": 79
},
{
"adv/mean_abs_final_conf": 0.6892921924591064,
"adv/mean_abs_reasoning": 0.5974498987197876,
"adv/mean_abs_step_conf": 0.7859007120132446,
"adv/ratio_final_to_reasoning": 1.1537238418419988,
"adv/ratio_step_to_reasoning": 1.3154252995895863,
"adv/std_final_conf": 0.8678206205368042,
"adv/std_reasoning": 0.8266484141349792,
"adv/std_step_conf": 0.9356747269630432,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6667343073593074,
"calib/avg_num_step_conf": 5.86328125,
"calib/ece": 0.2576,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.756,
"calib/gap": 0.20345643939393931,
"calib/mean_conf": 0.8596,
"calib/mu_c": 0.9377272727272727,
"calib/mu_w": 0.7342708333333334,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.2506,
"calib/std_conf": 0.2794377211473068,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5855282555282555,
"calib/step_q_c_n": 814.0,
"calib/step_q_gap": 0.06464033704208383,
"calib/step_q_w": 0.5208879184861717,
"calib/step_q_w_n": 687.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2390.0,
"completions/max_terminated_length": 2390.0,
"completions/mean_length": 474.1015625,
"completions/mean_terminated_length": 474.1015625,
"completions/min_length": 156.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.08533333333333333,
"grad_norm": 0.04135843738913536,
"kl": 0.078033447265625,
"learning_rate": 3.3333333333333333e-06,
"loss": -0.0204,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03415698558092117,
"mask/share_reasoning": 0.8395260572433472,
"mask/share_step_conf": 0.12631690502166748,
"num_tokens": 18872815.0,
"reward": 1.1109731197357178,
"reward_std": 0.27404117584228516,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.7014476656913757,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8037697076797485,
"step": 80
},
{
"adv/mean_abs_final_conf": 0.6472058296203613,
"adv/mean_abs_reasoning": 0.5512617826461792,
"adv/mean_abs_step_conf": 0.7223784923553467,
"adv/ratio_final_to_reasoning": 1.174044437678283,
"adv/ratio_step_to_reasoning": 1.310409165111663,
"adv/std_final_conf": 0.8557976484298706,
"adv/std_reasoning": 0.7928785681724548,
"adv/std_step_conf": 0.9358363151550293,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.7318347953216374,
"calib/avg_num_step_conf": 5.203125,
"calib/ece": 0.20165289256198346,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.731404958677686,
"calib/gap": 0.3349956140350877,
"calib/mean_conf": 0.8207438016528926,
"calib/mu_c": 0.945328947368421,
"calib/mu_w": 0.6103333333333333,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.19714876033057852,
"calib/std_conf": 0.31975627414997615,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.6455555555555557,
"calib/step_q_c_n": 612.0,
"calib/step_q_gap": 0.21428990882398408,
"calib/step_q_w": 0.4312656467315716,
"calib/step_q_w_n": 719.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 3024.0,
"completions/max_terminated_length": 3024.0,
"completions/mean_length": 502.4921875,
"completions/mean_terminated_length": 506.4488220214844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.0864,
"grad_norm": 0.02329258807003498,
"kl": 0.07903289794921875,
"learning_rate": 3.3055555555555558e-06,
"loss": 0.0445,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.034032806754112244,
"mask/share_reasoning": 0.8530516624450684,
"mask/share_step_conf": 0.1051030158996582,
"num_tokens": 19107701.0,
"reward": 1.1032999753952026,
"reward_std": 0.30462607741355896,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7289878726005554,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l2_reward": 0.781428873538971,
"step": 81
},
{
"adv/mean_abs_final_conf": 0.6715503931045532,
"adv/mean_abs_reasoning": 0.6163297891616821,
"adv/mean_abs_step_conf": 0.7347931861877441,
"adv/ratio_final_to_reasoning": 1.0895958704478343,
"adv/ratio_step_to_reasoning": 1.192207806776942,
"adv/std_final_conf": 0.8576227426528931,
"adv/std_reasoning": 0.8266252875328064,
"adv/std_step_conf": 0.9357723593711853,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7495399858457183,
"calib/avg_num_step_conf": 4.40234375,
"calib/ece": 0.2589473684210527,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.7611336032388664,
"calib/gap": 0.23805944798301482,
"calib/mean_conf": 0.8519838056680162,
"calib/mu_c": 0.9387261146496815,
"calib/mu_w": 0.7006666666666667,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.2376518218623483,
"calib/std_conf": 0.2900099672670955,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.6533693415637861,
"calib/step_q_c_n": 648.0,
"calib/step_q_gap": 0.09132341254499687,
"calib/step_q_w": 0.5620459290187892,
"calib/step_q_w_n": 479.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1323.0,
"completions/max_terminated_length": 1323.0,
"completions/mean_length": 428.57421875,
"completions/mean_terminated_length": 430.2549133300781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.08746666666666666,
"grad_norm": 0.03629959747195244,
"kl": 0.09059906005859375,
"learning_rate": 3.277777777777778e-06,
"loss": -0.0165,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03759324923157692,
"mask/share_reasoning": 0.8498210310935974,
"mask/share_step_conf": 0.10867946594953537,
"num_tokens": 19322968.0,
"reward": 1.1178183555603027,
"reward_std": 0.3032802641391754,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7175562381744385,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8021576404571533,
"step": 82
},
{
"adv/mean_abs_final_conf": 0.6760067939758301,
"adv/mean_abs_reasoning": 0.5050960183143616,
"adv/mean_abs_step_conf": 0.7669603228569031,
"adv/ratio_final_to_reasoning": 1.3383728429137944,
"adv/ratio_step_to_reasoning": 1.5184446027043563,
"adv/std_final_conf": 0.8897445797920227,
"adv/std_reasoning": 0.757703423500061,
"adv/std_step_conf": 0.9363173842430115,
"calib/answer_extract_rate": 0.93359375,
"calib/auroc": 0.7130994989262704,
"calib/avg_num_step_conf": 4.45703125,
"calib/ece": 0.3174964838255978,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.8984375,
"calib/frac_conf_gt_0.9": 0.7426160337552743,
"calib/gap": 0.24436912431400637,
"calib/mean_conf": 0.8375246132208157,
"calib/mu_c": 0.9509448818897638,
"calib/mu_w": 0.7065757575757574,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.93359375,
"calib/pce": 0.30957805907172997,
"calib/std_conf": 0.30996841328257,
"calib/step_conf_rate": 0.93359375,
"calib/step_q_c": 0.6469359375,
"calib/step_q_c_n": 512.0,
"calib/step_q_gap": 0.13972131110890296,
"calib/step_q_w": 0.507214626391097,
"calib/step_q_w_n": 629.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2764.0,
"completions/max_terminated_length": 2764.0,
"completions/mean_length": 517.15234375,
"completions/mean_terminated_length": 521.2244262695312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.08853333333333334,
"grad_norm": 0.03925936296582222,
"kl": 0.0845184326171875,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.0406,
"mask/has_final_conf_rate": 0.92578125,
"mask/share_final_conf": 0.032422006130218506,
"mask/share_reasoning": 0.8642917275428772,
"mask/share_step_conf": 0.0954737514257431,
"num_tokens": 19562623.0,
"reward": 1.004056692123413,
"reward_std": 0.3257407248020172,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.6180038452148438,
"rewards/format_reward_step": 0.8984375,
"rewards/step_l2_reward": 0.7408022284507751,
"step": 83
},
{
"adv/mean_abs_final_conf": 0.7383853793144226,
"adv/mean_abs_reasoning": 0.5751377940177917,
"adv/mean_abs_step_conf": 0.7391847968101501,
"adv/ratio_final_to_reasoning": 1.2838408238766879,
"adv/ratio_step_to_reasoning": 1.285230782081561,
"adv/std_final_conf": 0.9037018418312073,
"adv/std_reasoning": 0.8099603652954102,
"adv/std_step_conf": 0.9362303614616394,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7144144144144143,
"calib/avg_num_step_conf": 3.8203125,
"calib/ece": 0.3360975609756096,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.7845528455284553,
"calib/gap": 0.16928528528528552,
"calib/mean_conf": 0.8765040650406504,
"calib/mu_c": 0.952888888888889,
"calib/mu_w": 0.7836036036036035,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.33191056910569094,
"calib/std_conf": 0.2616069283157941,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.616244966442953,
"calib/step_q_c_n": 596.0,
"calib/step_q_gap": 0.024098369584314328,
"calib/step_q_w": 0.5921465968586387,
"calib/step_q_w_n": 382.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1611.0,
"completions/max_terminated_length": 1611.0,
"completions/mean_length": 427.90234375,
"completions/mean_terminated_length": 429.5804138183594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.0896,
"grad_norm": 0.032014407217502594,
"kl": 0.098663330078125,
"learning_rate": 3.2222222222222227e-06,
"loss": -0.0648,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.03773626685142517,
"mask/share_reasoning": 0.8612334728240967,
"mask/share_step_conf": 0.09712400287389755,
"num_tokens": 19778086.0,
"reward": 1.0297787189483643,
"reward_std": 0.3220594525337219,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.6267675161361694,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.758318305015564,
"step": 84
},
{
"adv/mean_abs_final_conf": 0.7482867240905762,
"adv/mean_abs_reasoning": 0.6304100751876831,
"adv/mean_abs_step_conf": 0.7630487680435181,
"adv/ratio_final_to_reasoning": 1.1869840815405739,
"adv/ratio_step_to_reasoning": 1.2104006551867788,
"adv/std_final_conf": 0.9141106009483337,
"adv/std_reasoning": 0.8590542674064636,
"adv/std_step_conf": 0.9360108971595764,
"calib/answer_extract_rate": 0.921875,
"calib/auroc": 0.6614173228346456,
"calib/avg_num_step_conf": 4.203125,
"calib/ece": 0.3278439716312057,
"calib/final_conf_rate": 0.91796875,
"calib/format_rate": 0.90234375,
"calib/frac_conf_gt_0.9": 0.7702127659574468,
"calib/gap": 0.22586905803441248,
"calib/mean_conf": 0.8366950354609929,
"calib/mu_c": 0.9404986876640421,
"calib/mu_w": 0.7146296296296296,
"calib/nonempty_final_conf_rate": 0.91796875,
"calib/nonempty_reasoning_rate": 0.95703125,
"calib/nonempty_step_conf_rate": 0.94140625,
"calib/pce": 0.31205673758865254,
"calib/std_conf": 0.30288713054935484,
"calib/step_conf_rate": 0.94140625,
"calib/step_q_c": 0.6443396226415095,
"calib/step_q_c_n": 477.0,
"calib/step_q_gap": 0.13430623365987338,
"calib/step_q_w": 0.5100333889816361,
"calib/step_q_w_n": 599.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2379.0,
"completions/max_terminated_length": 2379.0,
"completions/mean_length": 524.30859375,
"completions/mean_terminated_length": 526.36474609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.09066666666666667,
"grad_norm": 0.03913387656211853,
"kl": 0.08313751220703125,
"learning_rate": 3.1944444444444443e-06,
"loss": -0.123,
"mask/has_final_conf_rate": 0.91796875,
"mask/share_final_conf": 0.033390868455171585,
"mask/share_reasoning": 0.8677021265029907,
"mask/share_step_conf": 0.09500078111886978,
"num_tokens": 20020133.0,
"reward": 1.0035605430603027,
"reward_std": 0.34797096252441406,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.6201419830322266,
"rewards/format_reward_step": 0.90234375,
"rewards/step_l2_reward": 0.7381943464279175,
"step": 85
},
{
"adv/mean_abs_final_conf": 0.7321887612342834,
"adv/mean_abs_reasoning": 0.5524415969848633,
"adv/mean_abs_step_conf": 0.7457665801048279,
"adv/ratio_final_to_reasoning": 1.3253686276168397,
"adv/ratio_step_to_reasoning": 1.349946463436318,
"adv/std_final_conf": 0.9024088978767395,
"adv/std_reasoning": 0.8267043828964233,
"adv/std_step_conf": 0.936254620552063,
"calib/answer_extract_rate": 0.90625,
"calib/auroc": 0.6933198380566802,
"calib/avg_num_step_conf": 3.41796875,
"calib/ece": 0.3585714285714286,
"calib/final_conf_rate": 0.90234375,
"calib/format_rate": 0.87890625,
"calib/frac_conf_gt_0.9": 0.7012987012987013,
"calib/gap": 0.16702654071075118,
"calib/mean_conf": 0.8194372294372294,
"calib/mu_c": 0.9040350877192983,
"calib/mu_w": 0.7370085470085471,
"calib/nonempty_final_conf_rate": 0.90234375,
"calib/nonempty_reasoning_rate": 0.9296875,
"calib/nonempty_step_conf_rate": 0.90234375,
"calib/pce": 0.34225108225108225,
"calib/std_conf": 0.30601493073827,
"calib/step_conf_rate": 0.90234375,
"calib/step_q_c": 0.6428678304239402,
"calib/step_q_c_n": 401.0,
"calib/step_q_gap": 0.1263699401285816,
"calib/step_q_w": 0.5164978902953586,
"calib/step_q_w_n": 474.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2842.0,
"completions/max_terminated_length": 2842.0,
"completions/mean_length": 487.5078125,
"completions/mean_terminated_length": 489.4196472167969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.09173333333333333,
"grad_norm": 0.045741576701402664,
"kl": 0.0944671630859375,
"learning_rate": 3.1666666666666667e-06,
"loss": -0.1206,
"mask/has_final_conf_rate": 0.90234375,
"mask/share_final_conf": 0.03539574146270752,
"mask/share_reasoning": 0.8742038607597351,
"mask/share_step_conf": 0.08649415522813797,
"num_tokens": 20250447.0,
"reward": 0.9497416019439697,
"reward_std": 0.3561251163482666,
"rewards/accuracy_reward_step": 0.44921875,
"rewards/final_brier_reward_step": 0.5509187579154968,
"rewards/format_reward_step": 0.87890625,
"rewards/step_l2_reward": 0.7219595909118652,
"step": 86
},
{
"adv/mean_abs_final_conf": 0.6731104850769043,
"adv/mean_abs_reasoning": 0.5100500583648682,
"adv/mean_abs_step_conf": 0.7500249147415161,
"adv/ratio_final_to_reasoning": 1.3196949476582347,
"adv/ratio_step_to_reasoning": 1.4704927534876981,
"adv/std_final_conf": 0.8464348316192627,
"adv/std_reasoning": 0.7577330470085144,
"adv/std_step_conf": 0.9360089302062988,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6721701621300016,
"calib/avg_num_step_conf": 3.375,
"calib/ece": 0.2669795918367348,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.8,
"calib/gap": 0.15429619713421605,
"calib/mean_conf": 0.8810204081632654,
"calib/mu_c": 0.933292181069959,
"calib/mu_w": 0.778995983935743,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.953125,
"calib/pce": 0.2433877551020409,
"calib/std_conf": 0.25129036973915836,
"calib/step_conf_rate": 0.953125,
"calib/step_q_c": 0.6871928166351606,
"calib/step_q_c_n": 529.0,
"calib/step_q_gap": 0.08473510519237448,
"calib/step_q_w": 0.6024577114427861,
"calib/step_q_w_n": 335.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2417.0,
"completions/max_terminated_length": 2417.0,
"completions/mean_length": 423.375,
"completions/mean_terminated_length": 425.0353088378906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 101.0,
"epoch": 0.0928,
"grad_norm": 0.034470539540052414,
"kl": 0.1080780029296875,
"learning_rate": 3.138888888888889e-06,
"loss": 0.0384,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.04032812640070915,
"mask/share_reasoning": 0.8646848201751709,
"mask/share_step_conf": 0.09108079969882965,
"num_tokens": 20464327.0,
"reward": 1.0932562351226807,
"reward_std": 0.292274534702301,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.6880762577056885,
"rewards/format_reward_step": 0.9375,
"rewards/step_l2_reward": 0.7885407209396362,
"step": 87
},
{
"adv/mean_abs_final_conf": 0.6974610090255737,
"adv/mean_abs_reasoning": 0.5662336349487305,
"adv/mean_abs_step_conf": 0.7676079273223877,
"adv/ratio_final_to_reasoning": 1.2317548198787682,
"adv/ratio_step_to_reasoning": 1.3556381676123683,
"adv/std_final_conf": 0.8608344793319702,
"adv/std_reasoning": 0.8100785613059998,
"adv/std_step_conf": 0.9360609650611877,
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.7919852941176472,
"calib/avg_num_step_conf": 3.6875,
"calib/ece": 0.25492937853107345,
"calib/final_conf_rate": 0.921875,
"calib/format_rate": 0.91015625,
"calib/frac_conf_gt_0.9": 0.7161016949152542,
"calib/gap": 0.3310803921568626,
"calib/mean_conf": 0.8156920903954802,
"calib/mu_c": 0.9559803921568627,
"calib/mu_w": 0.6249000000000001,
"calib/nonempty_final_conf_rate": 0.921875,
"calib/nonempty_reasoning_rate": 0.953125,
"calib/nonempty_step_conf_rate": 0.9296875,
"calib/pce": 0.24717514124293788,
"calib/std_conf": 0.3247186835694789,
"calib/step_conf_rate": 0.9296875,
"calib/step_q_c": 0.6507528957528959,
"calib/step_q_c_n": 518.0,
"calib/step_q_gap": 0.10406275490782546,
"calib/step_q_w": 0.5466901408450704,
"calib/step_q_w_n": 426.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2452.0,
"completions/max_terminated_length": 2452.0,
"completions/mean_length": 490.2421875,
"completions/mean_terminated_length": 492.16473388671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.09386666666666667,
"grad_norm": 0.041649460792541504,
"kl": 0.0970458984375,
"learning_rate": 3.1111111111111116e-06,
"loss": -0.0827,
"mask/has_final_conf_rate": 0.921875,
"mask/share_final_conf": 0.033657241612672806,
"mask/share_reasoning": 0.8734369874000549,
"mask/share_step_conf": 0.08899955451488495,
"num_tokens": 20699677.0,
"reward": 1.053786039352417,
"reward_std": 0.3305705785751343,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6846892833709717,
"rewards/format_reward_step": 0.91015625,
"rewards/step_l2_reward": 0.7558801174163818,
"step": 88
},
{
"adv/mean_abs_final_conf": 0.7344450950622559,
"adv/mean_abs_reasoning": 0.635488748550415,
"adv/mean_abs_step_conf": 0.7786235213279724,
"adv/ratio_final_to_reasoning": 1.155716913537125,
"adv/ratio_step_to_reasoning": 1.2252357309300215,
"adv/std_final_conf": 0.9001230597496033,
"adv/std_reasoning": 0.8431754112243652,
"adv/std_step_conf": 0.936394214630127,
"calib/answer_extract_rate": 0.91796875,
"calib/auroc": 0.7706845238095239,
"calib/avg_num_step_conf": 3.35546875,
"calib/ece": 0.2964367816091954,
"calib/final_conf_rate": 0.90625,
"calib/format_rate": 0.890625,
"calib/frac_conf_gt_0.9": 0.5775862068965517,
"calib/gap": 0.26714285714285724,
"calib/mean_conf": 0.7518678160919541,
"calib/mu_c": 0.8808333333333335,
"calib/mu_w": 0.6136904761904762,
"calib/nonempty_final_conf_rate": 0.90625,
"calib/nonempty_reasoning_rate": 0.953125,
"calib/nonempty_step_conf_rate": 0.92578125,
"calib/pce": 0.2655316091954023,
"calib/std_conf": 0.34185410860418225,
"calib/step_conf_rate": 0.92578125,
"calib/step_q_c": 0.6363118811881188,
"calib/step_q_c_n": 404.0,
"calib/step_q_gap": 0.1137140789903166,
"calib/step_q_w": 0.5225978021978022,
"calib/step_q_w_n": 455.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2357.0,
"completions/max_terminated_length": 2357.0,
"completions/mean_length": 495.140625,
"completions/mean_terminated_length": 497.0823974609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.09493333333333333,
"grad_norm": 0.04524797201156616,
"kl": 0.1009368896484375,
"learning_rate": 3.0833333333333336e-06,
"loss": -0.1369,
"mask/has_final_conf_rate": 0.90625,
"mask/share_final_conf": 0.035608772188425064,
"mask/share_reasoning": 0.8790637254714966,
"mask/share_step_conf": 0.08142121136188507,
"num_tokens": 20935321.0,
"reward": 0.9926495552062988,
"reward_std": 0.3590254783630371,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.6327042579650879,
"rewards/format_reward_step": 0.890625,
"rewards/step_l2_reward": 0.7194381952285767,
"step": 89
},
{
"adv/mean_abs_final_conf": 0.7435449361801147,
"adv/mean_abs_reasoning": 0.5475056171417236,
"adv/mean_abs_step_conf": 0.7501516342163086,
"adv/ratio_final_to_reasoning": 1.3580590096259153,
"adv/ratio_step_to_reasoning": 1.3701259142006745,
"adv/std_final_conf": 0.894040048122406,
"adv/std_reasoning": 0.7929933071136475,
"adv/std_step_conf": 0.9362837076187134,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.7313868613138687,
"calib/avg_num_step_conf": 4.1640625,
"calib/ece": 0.2791701828410689,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.9140625,
"calib/frac_conf_gt_0.9": 0.6919831223628692,
"calib/gap": 0.21087469586374696,
"calib/mean_conf": 0.8173980309423348,
"calib/mu_c": 0.906374695863747,
"calib/mu_w": 0.6955,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.94140625,
"calib/pce": 0.25925457102672295,
"calib/std_conf": 0.30748016228322245,
"calib/step_conf_rate": 0.94140625,
"calib/step_q_c": 0.5799717514124293,
"calib/step_q_c_n": 531.0,
"calib/step_q_gap": 0.05376614393579382,
"calib/step_q_w": 0.5262056074766355,
"calib/step_q_w_n": 535.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2470.0,
"completions/max_terminated_length": 2470.0,
"completions/mean_length": 492.58203125,
"completions/mean_terminated_length": 494.5137634277344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.096,
"grad_norm": 0.03284657001495361,
"kl": 0.10390472412109375,
"learning_rate": 3.055555555555556e-06,
"loss": -0.098,
"mask/has_final_conf_rate": 0.92578125,
"mask/share_final_conf": 0.0350940003991127,
"mask/share_reasoning": 0.8628687858581543,
"mask/share_step_conf": 0.09813091158866882,
"num_tokens": 21164742.0,
"reward": 1.0324347019195557,
"reward_std": 0.3419674038887024,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.642932653427124,
"rewards/format_reward_step": 0.9140625,
"rewards/step_l2_reward": 0.7542078495025635,
"step": 90
},
{
"adv/mean_abs_final_conf": 0.7523043751716614,
"adv/mean_abs_reasoning": 0.6084607839584351,
"adv/mean_abs_step_conf": 0.7678749561309814,
"adv/ratio_final_to_reasoning": 1.2364056895785949,
"adv/ratio_step_to_reasoning": 1.261995803797663,
"adv/std_final_conf": 0.9162842035293579,
"adv/std_reasoning": 0.8269948363304138,
"adv/std_step_conf": 0.9362143278121948,
"calib/answer_extract_rate": 0.89453125,
"calib/auroc": 0.6887827170751973,
"calib/avg_num_step_conf": 3.7890625,
"calib/ece": 0.27266081871345027,
"calib/final_conf_rate": 0.890625,
"calib/format_rate": 0.87890625,
"calib/frac_conf_gt_0.9": 0.7017543859649122,
"calib/gap": 0.1607998892120206,
"calib/mean_conf": 0.8428654970760234,
"calib/mu_c": 0.9014022988505748,
"calib/mu_w": 0.7406024096385542,
"calib/nonempty_final_conf_rate": 0.890625,
"calib/nonempty_reasoning_rate": 0.9296875,
"calib/nonempty_step_conf_rate": 0.91796875,
"calib/pce": 0.23978070175438596,
"calib/std_conf": 0.2834935485075731,
"calib/step_conf_rate": 0.91796875,
"calib/step_q_c": 0.5960701107011072,
"calib/step_q_c_n": 542.0,
"calib/step_q_gap": 0.1316775873366211,
"calib/step_q_w": 0.46439252336448605,
"calib/step_q_w_n": 428.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1640.0,
"completions/max_terminated_length": 1640.0,
"completions/mean_length": 489.32421875,
"completions/mean_terminated_length": 491.2431640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.09706666666666666,
"grad_norm": 0.04172343760728836,
"kl": 0.1055908203125,
"learning_rate": 3.0277777777777776e-06,
"loss": -0.1185,
"mask/has_final_conf_rate": 0.890625,
"mask/share_final_conf": 0.03135697543621063,
"mask/share_reasoning": 0.8781741857528687,
"mask/share_step_conf": 0.08656258881092072,
"num_tokens": 21397721.0,
"reward": 1.0270675420761108,
"reward_std": 0.3646523356437683,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6375107765197754,
"rewards/format_reward_step": 0.87890625,
"rewards/step_l2_reward": 0.7517077922821045,
"step": 91
},
{
"adv/mean_abs_final_conf": 0.7131840586662292,
"adv/mean_abs_reasoning": 0.5634198188781738,
"adv/mean_abs_step_conf": 0.7377828359603882,
"adv/ratio_final_to_reasoning": 1.2658128712728836,
"adv/ratio_step_to_reasoning": 1.3094726369927647,
"adv/std_final_conf": 0.8928073644638062,
"adv/std_reasoning": 0.8266968727111816,
"adv/std_step_conf": 0.9362941384315491,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7849395077179808,
"calib/avg_num_step_conf": 3.14453125,
"calib/ece": 0.22082321187584347,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.659919028340081,
"calib/gap": 0.26598688175033613,
"calib/mean_conf": 0.7991228070175438,
"calib/mu_c": 0.9003485838779957,
"calib/mu_w": 0.6343617021276595,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.20025641025641028,
"calib/std_conf": 0.3049184381507363,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.6151875,
"calib/step_q_c_n": 480.0,
"calib/step_q_gap": 0.06992596153846165,
"calib/step_q_w": 0.5452615384615384,
"calib/step_q_w_n": 325.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1170.0,
"completions/max_terminated_length": 1170.0,
"completions/mean_length": 408.37890625,
"completions/mean_terminated_length": 409.98040771484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.09813333333333334,
"grad_norm": 0.0428118035197258,
"kl": 0.1136627197265625,
"learning_rate": 3e-06,
"loss": -0.1626,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03909327834844589,
"mask/share_reasoning": 0.87115478515625,
"mask/share_step_conf": 0.0858457013964653,
"num_tokens": 21608986.0,
"reward": 1.0958373546600342,
"reward_std": 0.3293219208717346,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.717007040977478,
"rewards/format_reward_step": 0.9375,
"rewards/step_l2_reward": 0.7784241437911987,
"step": 92
},
{
"adv/mean_abs_final_conf": 0.7348818778991699,
"adv/mean_abs_reasoning": 0.5190085172653198,
"adv/mean_abs_step_conf": 0.7640940546989441,
"adv/ratio_final_to_reasoning": 1.4159341387522828,
"adv/ratio_step_to_reasoning": 1.4722187195019292,
"adv/std_final_conf": 0.8839680552482605,
"adv/std_reasoning": 0.7756069898605347,
"adv/std_step_conf": 0.9361538290977478,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.6999163821336492,
"calib/avg_num_step_conf": 3.984375,
"calib/ece": 0.26783333333333337,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.5541666666666667,
"calib/gap": 0.23909553341230583,
"calib/mean_conf": 0.7275833333333334,
"calib/mu_c": 0.8401574803149606,
"calib/mu_w": 0.6010619469026548,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.9453125,
"calib/pce": 0.23312500000000003,
"calib/std_conf": 0.3476324587677176,
"calib/step_conf_rate": 0.9453125,
"calib/step_q_c": 0.5878935698447894,
"calib/step_q_c_n": 451.0,
"calib/step_q_gap": 0.15009040640014965,
"calib/step_q_w": 0.4378031634446397,
"calib/step_q_w_n": 569.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2115.0,
"completions/max_terminated_length": 2115.0,
"completions/mean_length": 474.6640625,
"completions/mean_terminated_length": 476.5255126953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 109.0,
"epoch": 0.0992,
"grad_norm": 0.027074242010712624,
"kl": 0.103118896484375,
"learning_rate": 2.9722222222222225e-06,
"loss": -0.0436,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.03518085181713104,
"mask/share_reasoning": 0.8667430877685547,
"mask/share_step_conf": 0.09416983276605606,
"num_tokens": 21836276.0,
"reward": 1.0502357482910156,
"reward_std": 0.32360124588012695,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.6509683728218079,
"rewards/format_reward_step": 0.921875,
"rewards/step_l2_reward": 0.7772729396820068,
"step": 93
},
{
"adv/mean_abs_final_conf": 0.7581357955932617,
"adv/mean_abs_reasoning": 0.5422264337539673,
"adv/mean_abs_step_conf": 0.7568789720535278,
"adv/ratio_final_to_reasoning": 1.3981904023831901,
"adv/ratio_step_to_reasoning": 1.3958725081207644,
"adv/std_final_conf": 0.9000263214111328,
"adv/std_reasoning": 0.7930181622505188,
"adv/std_step_conf": 0.9360536336898804,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.711711711711712,
"calib/avg_num_step_conf": 3.6171875,
"calib/ece": 0.2289711934156379,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.4691358024691358,
"calib/gap": 0.2531674856674858,
"calib/mean_conf": 0.6909465020576131,
"calib/mu_c": 0.8065909090909091,
"calib/mu_w": 0.5534234234234233,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.94921875,
"calib/pce": 0.18835390946502062,
"calib/std_conf": 0.3527766538595497,
"calib/step_conf_rate": 0.94921875,
"calib/step_q_c": 0.5513870246085012,
"calib/step_q_c_n": 447.0,
"calib/step_q_gap": 0.07401750477551577,
"calib/step_q_w": 0.4773695198329854,
"calib/step_q_w_n": 479.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2849.0,
"completions/max_terminated_length": 2849.0,
"completions/mean_length": 464.6875,
"completions/mean_terminated_length": 466.50982666015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 139.0,
"epoch": 0.10026666666666667,
"grad_norm": 0.041096802800893784,
"kl": 0.11529541015625,
"learning_rate": 2.944444444444445e-06,
"loss": -0.1382,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.03776441887021065,
"mask/share_reasoning": 0.8714349865913391,
"mask/share_step_conf": 0.08689434826374054,
"num_tokens": 22063916.0,
"reward": 1.0612201690673828,
"reward_std": 0.31181395053863525,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.6851406097412109,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l2_reward": 0.7654915452003479,
"step": 94
},
{
"adv/mean_abs_final_conf": 0.6974964141845703,
"adv/mean_abs_reasoning": 0.4942771792411804,
"adv/mean_abs_step_conf": 0.7726743221282959,
"adv/ratio_final_to_reasoning": 1.4111442799268503,
"adv/ratio_step_to_reasoning": 1.5632409396576101,
"adv/std_final_conf": 0.8614376783370972,
"adv/std_reasoning": 0.7395118474960327,
"adv/std_step_conf": 0.9359301924705505,
"calib/answer_extract_rate": 0.9296875,
"calib/auroc": 0.8170363663799607,
"calib/avg_num_step_conf": 3.82421875,
"calib/ece": 0.1715319148936169,
"calib/final_conf_rate": 0.91796875,
"calib/format_rate": 0.87890625,
"calib/frac_conf_gt_0.9": 0.5829787234042553,
"calib/gap": 0.35925531914893627,
"calib/mean_conf": 0.7333191489361703,
"calib/mu_c": 0.8770212765957447,
"calib/mu_w": 0.5177659574468084,
"calib/nonempty_final_conf_rate": 0.91796875,
"calib/nonempty_reasoning_rate": 0.94140625,
"calib/nonempty_step_conf_rate": 0.89453125,
"calib/pce": 0.1524255319148935,
"calib/std_conf": 0.3477918860702431,
"calib/step_conf_rate": 0.89453125,
"calib/step_q_c": 0.5590125391849531,
"calib/step_q_c_n": 638.0,
"calib/step_q_gap": 0.10220902012336958,
"calib/step_q_w": 0.4568035190615835,
"calib/step_q_w_n": 341.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1924.0,
"completions/max_terminated_length": 1924.0,
"completions/mean_length": 467.44921875,
"completions/mean_terminated_length": 469.2823791503906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 60.0,
"epoch": 0.10133333333333333,
"grad_norm": 0.03094375506043434,
"kl": 0.10405731201171875,
"learning_rate": 2.916666666666667e-06,
"loss": -0.1834,
"mask/has_final_conf_rate": 0.91796875,
"mask/share_final_conf": 0.0356331393122673,
"mask/share_reasoning": 0.8669276833534241,
"mask/share_step_conf": 0.09353290498256683,
"num_tokens": 22289711.0,
"reward": 1.0407851934432983,
"reward_std": 0.33165183663368225,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7023831605911255,
"rewards/format_reward_step": 0.87890625,
"rewards/step_l2_reward": 0.727791428565979,
"step": 95
},
{
"adv/mean_abs_final_conf": 0.7360479831695557,
"adv/mean_abs_reasoning": 0.5627431869506836,
"adv/mean_abs_step_conf": 0.7364088892936707,
"adv/ratio_final_to_reasoning": 1.307964272580451,
"adv/ratio_step_to_reasoning": 1.308605606198492,
"adv/std_final_conf": 0.903143584728241,
"adv/std_reasoning": 0.8101238012313843,
"adv/std_step_conf": 0.9362661242485046,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.8285256410256411,
"calib/avg_num_step_conf": 4.0390625,
"calib/ece": 0.13099173553719007,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.6115702479338843,
"calib/gap": 0.4288267740011925,
"calib/mean_conf": 0.7756198347107438,
"calib/mu_c": 0.9280128205128205,
"calib/mu_w": 0.499186046511628,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.13099173553719007,
"calib/std_conf": 0.330956271766317,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.5867115384615385,
"calib/step_q_c_n": 520.0,
"calib/step_q_gap": 0.1595909158934451,
"calib/step_q_w": 0.42712062256809336,
"calib/step_q_w_n": 514.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2613.0,
"completions/max_terminated_length": 2613.0,
"completions/mean_length": 421.73828125,
"completions/mean_terminated_length": 421.73828125,
"completions/min_length": 157.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.1024,
"grad_norm": 0.031053684651851654,
"kl": 0.12139892578125,
"learning_rate": 2.888888888888889e-06,
"loss": 0.0149,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.039800334721803665,
"mask/share_reasoning": 0.8641690611839294,
"mask/share_step_conf": 0.0960305854678154,
"num_tokens": 22503492.0,
"reward": 1.1341462135314941,
"reward_std": 0.3485493063926697,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7765765190124512,
"rewards/format_reward_step": 0.92578125,
"rewards/step_l2_reward": 0.7897896766662598,
"step": 96
},
{
"adv/mean_abs_final_conf": 0.7563111186027527,
"adv/mean_abs_reasoning": 0.5842485427856445,
"adv/mean_abs_step_conf": 0.7468534111976624,
"adv/ratio_final_to_reasoning": 1.2945023619515237,
"adv/ratio_step_to_reasoning": 1.278314546813814,
"adv/std_final_conf": 0.9269362092018127,
"adv/std_reasoning": 0.8266255259513855,
"adv/std_step_conf": 0.9361647367477417,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.7186663922617823,
"calib/avg_num_step_conf": 4.36328125,
"calib/ece": 0.21413223140495874,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.4793388429752066,
"calib/gap": 0.29765109418947666,
"calib/mean_conf": 0.667603305785124,
"calib/mu_c": 0.8065891472868217,
"calib/mu_w": 0.5089380530973451,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.9609375,
"calib/nonempty_step_conf_rate": 0.9375,
"calib/pce": 0.1743388429752067,
"calib/std_conf": 0.37392495997399144,
"calib/step_conf_rate": 0.9375,
"calib/step_q_c": 0.5463506261180681,
"calib/step_q_c_n": 559.0,
"calib/step_q_gap": 0.136171414648534,
"calib/step_q_w": 0.41017921146953407,
"calib/step_q_w_n": 558.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1998.0,
"completions/max_terminated_length": 1998.0,
"completions/mean_length": 444.62890625,
"completions/mean_terminated_length": 444.62890625,
"completions/min_length": 115.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.10346666666666667,
"grad_norm": 0.050117410719394684,
"kl": 0.1236724853515625,
"learning_rate": 2.861111111111111e-06,
"loss": -0.1227,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.03786986321210861,
"mask/share_reasoning": 0.8509825468063354,
"mask/share_step_conf": 0.11114759743213654,
"num_tokens": 22722389.0,
"reward": 1.0693061351776123,
"reward_std": 0.3145286738872528,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.6830066442489624,
"rewards/format_reward_step": 0.91796875,
"rewards/step_l2_reward": 0.780820369720459,
"step": 97
},
{
"adv/mean_abs_final_conf": 0.7835785746574402,
"adv/mean_abs_reasoning": 0.6385958790779114,
"adv/mean_abs_step_conf": 0.7745162844657898,
"adv/ratio_final_to_reasoning": 1.227033559610303,
"adv/ratio_step_to_reasoning": 1.2128425970805483,
"adv/std_final_conf": 0.9290453791618347,
"adv/std_reasoning": 0.843262791633606,
"adv/std_step_conf": 0.9362633228302002,
"calib/answer_extract_rate": 0.91015625,
"calib/auroc": 0.6779699248120301,
"calib/avg_num_step_conf": 3.12109375,
"calib/ece": 0.2271101573676681,
"calib/final_conf_rate": 0.91015625,
"calib/format_rate": 0.890625,
"calib/frac_conf_gt_0.9": 0.49356223175965663,
"calib/gap": 0.27745363408521306,
"calib/mean_conf": 0.6723748211731044,
"calib/mu_c": 0.7914536340852131,
"calib/mu_w": 0.514,
"calib/nonempty_final_conf_rate": 0.91015625,
"calib/nonempty_reasoning_rate": 0.9296875,
"calib/nonempty_step_conf_rate": 0.91015625,
"calib/pce": 0.16433476394849786,
"calib/std_conf": 0.37874272681994453,
"calib/step_conf_rate": 0.91015625,
"calib/step_q_c": 0.572358276643991,
"calib/step_q_c_n": 441.0,
"calib/step_q_gap": 0.10626889116913057,
"calib/step_q_w": 0.4660893854748604,
"calib/step_q_w_n": 358.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2880.0,
"completions/max_terminated_length": 2880.0,
"completions/mean_length": 480.21484375,
"completions/mean_terminated_length": 482.0980529785156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 109.0,
"epoch": 0.10453333333333334,
"grad_norm": 0.04808899015188217,
"kl": 0.10540771484375,
"learning_rate": 2.8333333333333335e-06,
"loss": -0.1129,
"mask/has_final_conf_rate": 0.91015625,
"mask/share_final_conf": 0.03742651268839836,
"mask/share_reasoning": 0.8740274310112,
"mask/share_step_conf": 0.08463980257511139,
"num_tokens": 22951508.0,
"reward": 1.0163490772247314,
"reward_std": 0.34398770332336426,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.6527763605117798,
"rewards/format_reward_step": 0.890625,
"rewards/step_l2_reward": 0.7319269776344299,
"step": 98
},
{
"adv/mean_abs_final_conf": 0.7636774778366089,
"adv/mean_abs_reasoning": 0.6109292507171631,
"adv/mean_abs_step_conf": 0.7603302597999573,
"adv/ratio_final_to_reasoning": 1.2500260495632454,
"adv/ratio_step_to_reasoning": 1.244547153221775,
"adv/std_final_conf": 0.9156864881515503,
"adv/std_reasoning": 0.843126118183136,
"adv/std_step_conf": 0.9363462328910828,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.6918441289559303,
"calib/avg_num_step_conf": 4.42578125,
"calib/ece": 0.22887005649717512,
"calib/final_conf_rate": 0.921875,
"calib/format_rate": 0.8984375,
"calib/frac_conf_gt_0.9": 0.288135593220339,
"calib/gap": 0.26150054224588376,
"calib/mean_conf": 0.48759887005649716,
"calib/mu_c": 0.6405102040816326,
"calib/mu_w": 0.37900966183574886,
"calib/nonempty_final_conf_rate": 0.921875,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.92578125,
"calib/pce": 0.15060734463276837,
"calib/std_conf": 0.3836272235226863,
"calib/step_conf_rate": 0.92578125,
"calib/step_q_c": 0.4688373655913979,
"calib/step_q_c_n": 496.0,
"calib/step_q_gap": 0.07384521488496143,
"calib/step_q_w": 0.39499215070643645,
"calib/step_q_w_n": 637.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2228.0,
"completions/max_terminated_length": 2228.0,
"completions/mean_length": 556.5703125,
"completions/mean_terminated_length": 558.7529907226562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 125.0,
"epoch": 0.1056,
"grad_norm": 0.030870946124196053,
"kl": 0.1034698486328125,
"learning_rate": 2.805555555555556e-06,
"loss": -0.1052,
"mask/has_final_conf_rate": 0.921875,
"mask/share_final_conf": 0.031477123498916626,
"mask/share_reasoning": 0.875032901763916,
"mask/share_step_conf": 0.08958369493484497,
"num_tokens": 23199790.0,
"reward": 1.0112491846084595,
"reward_std": 0.3343077301979065,
"rewards/accuracy_reward_step": 0.390625,
"rewards/final_brier_reward_step": 0.6536698937416077,
"rewards/format_reward_step": 0.8984375,
"rewards/step_l2_reward": 0.7406773567199707,
"step": 99
},
{
"adv/mean_abs_final_conf": 0.7110211849212646,
"adv/mean_abs_reasoning": 0.5518664121627808,
"adv/mean_abs_step_conf": 0.7303281426429749,
"adv/ratio_final_to_reasoning": 1.2883936569626544,
"adv/ratio_step_to_reasoning": 1.3233784962212092,
"adv/std_final_conf": 0.9070796966552734,
"adv/std_reasoning": 0.8099352717399597,
"adv/std_step_conf": 0.9358618855476379,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.8162401574803149,
"calib/avg_num_step_conf": 4.3203125,
"calib/ece": 0.1366261808367072,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.39271255060728744,
"calib/gap": 0.4239192913385827,
"calib/mean_conf": 0.5830499325236167,
"calib/mu_c": 0.7890026246719161,
"calib/mu_w": 0.36508333333333337,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.10275303643724698,
"calib/std_conf": 0.3924053656183008,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.5050714285714285,
"calib/step_q_c_n": 560.0,
"calib/step_q_gap": 0.12179304029304028,
"calib/step_q_w": 0.3832783882783882,
"calib/step_q_w_n": 546.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2199.0,
"completions/max_terminated_length": 2199.0,
"completions/mean_length": 499.890625,
"completions/mean_terminated_length": 499.890625,
"completions/min_length": 146.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.10666666666666667,
"grad_norm": 0.03531169518828392,
"kl": 0.10375213623046875,
"learning_rate": 2.7777777777777783e-06,
"loss": -0.063,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.035181716084480286,
"mask/share_reasoning": 0.8687215447425842,
"mask/share_step_conf": 0.09609673917293549,
"num_tokens": 23435170.0,
"reward": 1.1387224197387695,
"reward_std": 0.2929524779319763,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.7556166648864746,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l2_reward": 0.8228855133056641,
"step": 100
},
{
"adv/mean_abs_final_conf": 0.780475914478302,
"adv/mean_abs_reasoning": 0.5842746496200562,
"adv/mean_abs_step_conf": 0.7489468455314636,
"adv/ratio_final_to_reasoning": 1.3358031449521766,
"adv/ratio_step_to_reasoning": 1.2818403913613075,
"adv/std_final_conf": 0.924403965473175,
"adv/std_reasoning": 0.8266467452049255,
"adv/std_step_conf": 0.935892641544342,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.7109784202807458,
"calib/avg_num_step_conf": 4.3359375,
"calib/ece": 0.21908333333333335,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.2708333333333333,
"calib/gap": 0.2959983239053006,
"calib/mean_conf": 0.4466666666666666,
"calib/mu_c": 0.6057657657657657,
"calib/mu_w": 0.30976744186046506,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 0.95703125,
"calib/nonempty_step_conf_rate": 0.9375,
"calib/pce": 0.10162500000000002,
"calib/std_conf": 0.3945024996400178,
"calib/step_conf_rate": 0.9375,
"calib/step_q_c": 0.44976545842217486,
"calib/step_q_c_n": 469.0,
"calib/step_q_gap": 0.1172225567061062,
"calib/step_q_w": 0.33254290171606865,
"calib/step_q_w_n": 641.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2282.0,
"completions/max_terminated_length": 2282.0,
"completions/mean_length": 522.67578125,
"completions/mean_terminated_length": 522.67578125,
"completions/min_length": 154.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.10773333333333333,
"grad_norm": 0.03128324821591377,
"kl": 0.116485595703125,
"learning_rate": 2.7500000000000004e-06,
"loss": -0.104,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.03342951089143753,
"mask/share_reasoning": 0.8713258504867554,
"mask/share_step_conf": 0.09524467587471008,
"num_tokens": 23675967.0,
"reward": 1.076228141784668,
"reward_std": 0.2763334810733795,
"rewards/accuracy_reward_step": 0.43359375,
"rewards/final_brier_reward_step": 0.6834926009178162,
"rewards/format_reward_step": 0.921875,
"rewards/step_l2_reward": 0.7985799908638,
"step": 101
},
{
"adv/mean_abs_final_conf": 0.6880002021789551,
"adv/mean_abs_reasoning": 0.4698541760444641,
"adv/mean_abs_step_conf": 0.7566457986831665,
"adv/ratio_final_to_reasoning": 1.4642845317902355,
"adv/ratio_step_to_reasoning": 1.610384321904084,
"adv/std_final_conf": 0.8732596039772034,
"adv/std_reasoning": 0.7207974195480347,
"adv/std_step_conf": 0.9353622794151306,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.8203735144312395,
"calib/avg_num_step_conf": 4.25390625,
"calib/ece": 0.17494666666666664,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.38,
"calib/gap": 0.43766270514997174,
"calib/mean_conf": 0.5403333333333333,
"calib/mu_c": 0.7066451612903226,
"calib/mu_w": 0.2689824561403509,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.04764,
"calib/std_conf": 0.3995536676509594,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.5064505672609401,
"calib/step_q_c_n": 617.0,
"calib/step_q_gap": 0.14831497404060107,
"calib/step_q_w": 0.358135593220339,
"calib/step_q_w_n": 472.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2993.0,
"completions/max_terminated_length": 2993.0,
"completions/mean_length": 426.96484375,
"completions/mean_terminated_length": 426.96484375,
"completions/min_length": 129.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.1088,
"grad_norm": 0.040686286985874176,
"kl": 0.1273956298828125,
"learning_rate": 2.7222222222222224e-06,
"loss": 0.0351,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.040620945394039154,
"mask/share_reasoning": 0.8511543273925781,
"mask/share_step_conf": 0.10822470486164093,
"num_tokens": 23891966.0,
"reward": 1.1479109525680542,
"reward_std": 0.22687333822250366,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.7660393714904785,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8120423555374146,
"step": 102
},
{
"adv/mean_abs_final_conf": 0.7139617204666138,
"adv/mean_abs_reasoning": 0.5544531345367432,
"adv/mean_abs_step_conf": 0.7314602136611938,
"adv/ratio_final_to_reasoning": 1.2876863272910222,
"adv/ratio_step_to_reasoning": 1.3192462412036747,
"adv/std_final_conf": 0.8981577754020691,
"adv/std_reasoning": 0.7929568886756897,
"adv/std_step_conf": 0.9361253976821899,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.7499628749628751,
"calib/avg_num_step_conf": 3.859375,
"calib/ece": 0.19418410041841005,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.28451882845188287,
"calib/gap": 0.31697727947727944,
"calib/mean_conf": 0.5151882845188285,
"calib/mu_c": 0.6358783783783784,
"calib/mu_w": 0.3189010989010989,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.04506276150627617,
"calib/std_conf": 0.37221246534829466,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.5089373848987109,
"calib/step_q_c_n": 543.0,
"calib/step_q_gap": 0.1450946882694974,
"calib/step_q_w": 0.3638426966292135,
"calib/step_q_w_n": 445.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2938.0,
"completions/max_terminated_length": 2938.0,
"completions/mean_length": 536.14453125,
"completions/mean_terminated_length": 536.14453125,
"completions/min_length": 149.0,
"completions/min_terminated_length": 149.0,
"epoch": 0.10986666666666667,
"grad_norm": 0.039745040237903595,
"kl": 0.10501861572265625,
"learning_rate": 2.6944444444444444e-06,
"loss": -0.1255,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.03567434847354889,
"mask/share_reasoning": 0.8767580986022949,
"mask/share_step_conf": 0.08756758272647858,
"num_tokens": 24133771.0,
"reward": 1.09434175491333,
"reward_std": 0.3003833293914795,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7061066031455994,
"rewards/format_reward_step": 0.921875,
"rewards/step_l2_reward": 0.7878636121749878,
"step": 103
},
{
"adv/mean_abs_final_conf": 0.7238519787788391,
"adv/mean_abs_reasoning": 0.5018754005432129,
"adv/mean_abs_step_conf": 0.752862811088562,
"adv/ratio_final_to_reasoning": 1.442294198909463,
"adv/ratio_step_to_reasoning": 1.50009904903426,
"adv/std_final_conf": 0.8926047086715698,
"adv/std_reasoning": 0.7394863963127136,
"adv/std_step_conf": 0.9357385635375977,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7445542046605875,
"calib/avg_num_step_conf": 4.21484375,
"calib/ece": 0.16699604743083005,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.2134387351778656,
"calib/gap": 0.32023239614994925,
"calib/mean_conf": 0.37679841897233207,
"calib/mu_c": 0.5552678571428571,
"calib/mu_w": 0.2350354609929078,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.050553359683794465,
"calib/std_conf": 0.36494791831233164,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5176548672566371,
"calib/step_q_c_n": 452.0,
"calib/step_q_gap": 0.14708070457721129,
"calib/step_q_w": 0.37057416267942583,
"calib/step_q_w_n": 627.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2885.0,
"completions/max_terminated_length": 2885.0,
"completions/mean_length": 458.49609375,
"completions/mean_terminated_length": 458.49609375,
"completions/min_length": 151.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.11093333333333333,
"grad_norm": 0.0384674109518528,
"kl": 0.1230621337890625,
"learning_rate": 2.666666666666667e-06,
"loss": -0.0462,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.036535318940877914,
"mask/share_reasoning": 0.8619049787521362,
"mask/share_step_conf": 0.10155968368053436,
"num_tokens": 24357826.0,
"reward": 1.1272393465042114,
"reward_std": 0.19814637303352356,
"rewards/accuracy_reward_step": 0.4375,
"rewards/final_brier_reward_step": 0.7452765703201294,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8186347484588623,
"step": 104
},
{
"adv/mean_abs_final_conf": 0.793289065361023,
"adv/mean_abs_reasoning": 0.68181312084198,
"adv/mean_abs_step_conf": 0.7531487941741943,
"adv/ratio_final_to_reasoning": 1.1634992655779048,
"adv/ratio_step_to_reasoning": 1.104626430838,
"adv/std_final_conf": 0.9366859793663025,
"adv/std_reasoning": 0.8904972672462463,
"adv/std_step_conf": 0.936499834060669,
"calib/answer_extract_rate": 0.93359375,
"calib/auroc": 0.7122390360095279,
"calib/avg_num_step_conf": 4.3984375,
"calib/ece": 0.2141422594142259,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.90625,
"calib/frac_conf_gt_0.9": 0.2510460251046025,
"calib/gap": 0.281342300686563,
"calib/mean_conf": 0.4468619246861924,
"calib/mu_c": 0.5845901639344263,
"calib/mu_w": 0.3032478632478633,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.9453125,
"calib/pce": 0.07527196652719664,
"calib/std_conf": 0.3829599891896605,
"calib/step_conf_rate": 0.9453125,
"calib/step_q_c": 0.4785921325051759,
"calib/step_q_c_n": 483.0,
"calib/step_q_gap": 0.10702137045229876,
"calib/step_q_w": 0.37157076205287715,
"calib/step_q_w_n": 643.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2331.0,
"completions/max_terminated_length": 2331.0,
"completions/mean_length": 495.2734375,
"completions/mean_terminated_length": 499.1732177734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 90.0,
"epoch": 0.112,
"grad_norm": 0.06704078614711761,
"kl": 0.1043853759765625,
"learning_rate": 2.6388888888888893e-06,
"loss": -0.1896,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.03599028289318085,
"mask/share_reasoning": 0.8575528860092163,
"mask/share_step_conf": 0.09864436089992523,
"num_tokens": 24590376.0,
"reward": 1.0415213108062744,
"reward_std": 0.3739849925041199,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.6758534908294678,
"rewards/format_reward_step": 0.90625,
"rewards/step_l2_reward": 0.753751277923584,
"step": 105
},
{
"adv/mean_abs_final_conf": 0.7251718044281006,
"adv/mean_abs_reasoning": 0.4511352479457855,
"adv/mean_abs_step_conf": 0.7519001364707947,
"adv/ratio_final_to_reasoning": 1.6074376979633545,
"adv/ratio_step_to_reasoning": 1.6666845250831588,
"adv/std_final_conf": 0.9015657305717468,
"adv/std_reasoning": 0.7206656336784363,
"adv/std_step_conf": 0.9358862638473511,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7424395161290323,
"calib/avg_num_step_conf": 4.1796875,
"calib/ece": 0.2087301587301587,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.3055555555555556,
"calib/gap": 0.31643145161290326,
"calib/mean_conf": 0.4999206349206349,
"calib/mu_c": 0.655625,
"calib/mu_w": 0.33919354838709675,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.10035714285714281,
"calib/std_conf": 0.38800976921048197,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5021814671814672,
"calib/step_q_c_n": 518.0,
"calib/step_q_gap": 0.11491093578050099,
"calib/step_q_w": 0.3872705314009662,
"calib/step_q_w_n": 552.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2565.0,
"completions/max_terminated_length": 2565.0,
"completions/mean_length": 448.51171875,
"completions/mean_terminated_length": 448.51171875,
"completions/min_length": 142.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.11306666666666666,
"grad_norm": 0.03875832259654999,
"kl": 0.116058349609375,
"learning_rate": 2.6111111111111113e-06,
"loss": -0.0122,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03689251095056534,
"mask/share_reasoning": 0.8643423914909363,
"mask/share_step_conf": 0.09876511245965958,
"num_tokens": 24809779.0,
"reward": 1.1323719024658203,
"reward_std": 0.22136250138282776,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.7384449243545532,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.82065749168396,
"step": 106
},
{
"adv/mean_abs_final_conf": 0.7681862115859985,
"adv/mean_abs_reasoning": 0.5777244567871094,
"adv/mean_abs_step_conf": 0.7491806745529175,
"adv/ratio_final_to_reasoning": 1.329675769411081,
"adv/ratio_step_to_reasoning": 1.2967785347349237,
"adv/std_final_conf": 0.9274890422821045,
"adv/std_reasoning": 0.792971134185791,
"adv/std_step_conf": 0.9358506798744202,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7280799475753603,
"calib/avg_num_step_conf": 4.62109375,
"calib/ece": 0.17730923694779113,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.3493975903614458,
"calib/gap": 0.32397575360419384,
"calib/mean_conf": 0.5883935742971887,
"calib/mu_c": 0.7302142857142856,
"calib/mu_w": 0.40623853211009175,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.10172690763052208,
"calib/std_conf": 0.3808053051634628,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.48981260647359454,
"calib/step_q_c_n": 587.0,
"calib/step_q_gap": 0.1215407943930576,
"calib/step_q_w": 0.36827181208053694,
"calib/step_q_w_n": 596.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2529.0,
"completions/max_terminated_length": 2529.0,
"completions/mean_length": 475.2734375,
"completions/mean_terminated_length": 475.2734375,
"completions/min_length": 134.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.11413333333333334,
"grad_norm": 0.07799244672060013,
"kl": 0.11484527587890625,
"learning_rate": 2.5833333333333337e-06,
"loss": 0.0611,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03707215189933777,
"mask/share_reasoning": 0.8559058904647827,
"mask/share_step_conf": 0.10702195018529892,
"num_tokens": 25036065.0,
"reward": 1.1105011701583862,
"reward_std": 0.2526007294654846,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.7247980833053589,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l2_reward": 0.7990319728851318,
"step": 107
},
{
"adv/mean_abs_final_conf": 0.6599736213684082,
"adv/mean_abs_reasoning": 0.5213769674301147,
"adv/mean_abs_step_conf": 0.7592152953147888,
"adv/ratio_final_to_reasoning": 1.2658281101703461,
"adv/ratio_step_to_reasoning": 1.4561734459751599,
"adv/std_final_conf": 0.865111231803894,
"adv/std_reasoning": 0.7577022314071655,
"adv/std_step_conf": 0.9359382390975952,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7108021390374332,
"calib/avg_num_step_conf": 4.62109375,
"calib/ece": 0.2111200000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.6,
"calib/gap": 0.2970837789661319,
"calib/mean_conf": 0.73184,
"calib/mu_c": 0.832848484848485,
"calib/mu_w": 0.535764705882353,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.14148000000000013,
"calib/std_conf": 0.36135054227162855,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.5201437908496732,
"calib/step_q_c_n": 765.0,
"calib/step_q_gap": 0.07531125496450575,
"calib/step_q_w": 0.44483253588516747,
"calib/step_q_w_n": 418.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1879.0,
"completions/max_terminated_length": 1879.0,
"completions/mean_length": 482.21875,
"completions/mean_terminated_length": 484.1098327636719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.1152,
"grad_norm": 0.05277223140001297,
"kl": 0.1013336181640625,
"learning_rate": 2.5555555555555557e-06,
"loss": -0.065,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.035249825567007065,
"mask/share_reasoning": 0.851841390132904,
"mask/share_step_conf": 0.10900251567363739,
"num_tokens": 25262745.0,
"reward": 1.1391626596450806,
"reward_std": 0.24059459567070007,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7480719089508057,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8061065673828125,
"step": 108
},
{
"adv/mean_abs_final_conf": 0.6619673371315002,
"adv/mean_abs_reasoning": 0.31352001428604126,
"adv/mean_abs_step_conf": 0.7552825212478638,
"adv/ratio_final_to_reasoning": 2.111403760423255,
"adv/ratio_step_to_reasoning": 2.4090408485334485,
"adv/std_final_conf": 0.8693623542785645,
"adv/std_reasoning": 0.6185252070426941,
"adv/std_step_conf": 0.9353784322738647,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.8374396748793497,
"calib/avg_num_step_conf": 5.0234375,
"calib/ece": 0.13673306772908367,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.4302788844621514,
"calib/gap": 0.5134213868427737,
"calib/mean_conf": 0.5750597609561753,
"calib/mu_c": 0.8348387096774195,
"calib/mu_w": 0.3214173228346457,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.10888446215139441,
"calib/std_conf": 0.42031516397350965,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5258373983739837,
"calib/step_q_c_n": 615.0,
"calib/step_q_gap": 0.15632920165267222,
"calib/step_q_w": 0.36950819672131147,
"calib/step_q_w_n": 671.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2485.0,
"completions/max_terminated_length": 2485.0,
"completions/mean_length": 492.90625,
"completions/mean_terminated_length": 492.90625,
"completions/min_length": 131.0,
"completions/min_terminated_length": 131.0,
"epoch": 0.11626666666666667,
"grad_norm": 0.031876228749752045,
"kl": 0.102783203125,
"learning_rate": 2.5277777777777778e-06,
"loss": -0.0039,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03490302711725235,
"mask/share_reasoning": 0.8533127307891846,
"mask/share_step_conf": 0.11178424954414368,
"num_tokens": 25493529.0,
"reward": 1.1876109838485718,
"reward_std": 0.2054915726184845,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.7973277568817139,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8581793904304504,
"step": 109
},
{
"adv/mean_abs_final_conf": 0.764315128326416,
"adv/mean_abs_reasoning": 0.5388686656951904,
"adv/mean_abs_step_conf": 0.7830692529678345,
"adv/ratio_final_to_reasoning": 1.4183699609632687,
"adv/ratio_step_to_reasoning": 1.4531727354337864,
"adv/std_final_conf": 0.9091663360595703,
"adv/std_reasoning": 0.7755135893821716,
"adv/std_step_conf": 0.9355852007865906,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.6606814600840337,
"calib/avg_num_step_conf": 4.69921875,
"calib/ece": 0.2866396761133603,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.4979757085020243,
"calib/gap": 0.23397058823529415,
"calib/mean_conf": 0.6352226720647772,
"calib/mu_c": 0.7564705882352941,
"calib/mu_w": 0.5225,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.2200404858299595,
"calib/std_conf": 0.40716638440223923,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.5408249496981891,
"calib/step_q_c_n": 497.0,
"calib/step_q_gap": 0.11504591287099358,
"calib/step_q_w": 0.4257790368271955,
"calib/step_q_w_n": 706.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1291.0,
"completions/max_terminated_length": 1291.0,
"completions/mean_length": 436.8203125,
"completions/mean_terminated_length": 438.5333557128906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.11733333333333333,
"grad_norm": 0.037987880408763885,
"kl": 0.1119537353515625,
"learning_rate": 2.5e-06,
"loss": 0.0048,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.036573655903339386,
"mask/share_reasoning": 0.8427259922027588,
"mask/share_step_conf": 0.11679408699274063,
"num_tokens": 25710275.0,
"reward": 1.06510329246521,
"reward_std": 0.23794767260551453,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/final_brier_reward_step": 0.6501030921936035,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.7966315746307373,
"step": 110
},
{
"adv/mean_abs_final_conf": 0.6385260820388794,
"adv/mean_abs_reasoning": 0.5157452821731567,
"adv/mean_abs_step_conf": 0.7536460161209106,
"adv/ratio_final_to_reasoning": 1.2380648046810443,
"adv/ratio_step_to_reasoning": 1.4612756377437515,
"adv/std_final_conf": 0.8398423194885254,
"adv/std_reasoning": 0.7576634883880615,
"adv/std_step_conf": 0.9358636140823364,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7523087293562521,
"calib/avg_num_step_conf": 4.07421875,
"calib/ece": 0.24954918032786902,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.6188524590163934,
"calib/gap": 0.3211425682507584,
"calib/mean_conf": 0.7228278688524591,
"calib/mu_c": 0.8741860465116279,
"calib/mu_w": 0.5530434782608695,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.22184426229508214,
"calib/std_conf": 0.38164521316469274,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.5588198757763976,
"calib/step_q_c_n": 483.0,
"calib/step_q_gap": 0.1459805900621119,
"calib/step_q_w": 0.4128392857142857,
"calib/step_q_w_n": 560.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2699.0,
"completions/max_terminated_length": 2699.0,
"completions/mean_length": 460.4375,
"completions/mean_terminated_length": 462.2431640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.1184,
"grad_norm": 0.031583499163389206,
"kl": 0.1053466796875,
"learning_rate": 2.4722222222222226e-06,
"loss": -0.0507,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.037514761090278625,
"mask/share_reasoning": 0.8553412556648254,
"mask/share_step_conf": 0.10323773324489594,
"num_tokens": 25935555.0,
"reward": 1.0907256603240967,
"reward_std": 0.26927220821380615,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.6895140409469604,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.8008747100830078,
"step": 111
},
{
"adv/mean_abs_final_conf": 0.6715695261955261,
"adv/mean_abs_reasoning": 0.4684767723083496,
"adv/mean_abs_step_conf": 0.7551547288894653,
"adv/ratio_final_to_reasoning": 1.4335172326398748,
"adv/ratio_step_to_reasoning": 1.6119363296680702,
"adv/std_final_conf": 0.8733709454536438,
"adv/std_reasoning": 0.7393901944160461,
"adv/std_step_conf": 0.9356892704963684,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.8344532279314888,
"calib/avg_num_step_conf": 4.625,
"calib/ece": 0.1204048582995951,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.44534412955465585,
"calib/gap": 0.5215131752305664,
"calib/mean_conf": 0.57165991902834,
"calib/mu_c": 0.814469696969697,
"calib/mu_w": 0.2929565217391305,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.07882591093117405,
"calib/std_conf": 0.42045001644516367,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.5273089171974522,
"calib/step_q_c_n": 628.0,
"calib/step_q_gap": 0.1731182697154378,
"calib/step_q_w": 0.3541906474820144,
"calib/step_q_w_n": 556.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2929.0,
"completions/max_terminated_length": 2929.0,
"completions/mean_length": 538.16796875,
"completions/mean_terminated_length": 538.16796875,
"completions/min_length": 97.0,
"completions/min_terminated_length": 97.0,
"epoch": 0.11946666666666667,
"grad_norm": 0.028904251754283905,
"kl": 0.097686767578125,
"learning_rate": 2.4444444444444447e-06,
"loss": -0.0036,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03284811973571777,
"mask/share_reasoning": 0.8688977360725403,
"mask/share_step_conf": 0.09825415909290314,
"num_tokens": 26181246.0,
"reward": 1.163731336593628,
"reward_std": 0.22518390417099,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.7981777191162109,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.8231689929962158,
"step": 112
},
{
"adv/mean_abs_final_conf": 0.6834825277328491,
"adv/mean_abs_reasoning": 0.5273545980453491,
"adv/mean_abs_step_conf": 0.7404484152793884,
"adv/ratio_final_to_reasoning": 1.2960587245587532,
"adv/ratio_step_to_reasoning": 1.404080703996658,
"adv/std_final_conf": 0.8908624053001404,
"adv/std_reasoning": 0.7928634285926819,
"adv/std_step_conf": 0.9354289174079895,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7851299643752476,
"calib/avg_num_step_conf": 4.99609375,
"calib/ece": 0.19566265060240962,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5662650602409639,
"calib/gap": 0.3544748647578836,
"calib/mean_conf": 0.6876305220883534,
"calib/mu_c": 0.8385314685314685,
"calib/mu_w": 0.48405660377358495,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.1544979919678715,
"calib/std_conf": 0.38450354998612984,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5137192474674385,
"calib/step_q_c_n": 691.0,
"calib/step_q_gap": 0.12504577807968337,
"calib/step_q_w": 0.3886734693877551,
"calib/step_q_w_n": 588.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2325.0,
"completions/max_terminated_length": 2325.0,
"completions/mean_length": 432.34765625,
"completions/mean_terminated_length": 432.34765625,
"completions/min_length": 153.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.12053333333333334,
"grad_norm": 0.03928934410214424,
"kl": 0.1289215087890625,
"learning_rate": 2.4166666666666667e-06,
"loss": 0.0206,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.037928465753793716,
"mask/share_reasoning": 0.8393813371658325,
"mask/share_step_conf": 0.12269022315740585,
"num_tokens": 26397127.0,
"reward": 1.1599385738372803,
"reward_std": 0.2528593838214874,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7432464361190796,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8474411964416504,
"step": 113
},
{
"adv/mean_abs_final_conf": 0.6147596836090088,
"adv/mean_abs_reasoning": 0.5180281400680542,
"adv/mean_abs_step_conf": 0.7503921389579773,
"adv/ratio_final_to_reasoning": 1.1867302875250112,
"adv/ratio_step_to_reasoning": 1.4485547809418173,
"adv/std_final_conf": 0.8503017425537109,
"adv/std_reasoning": 0.7753562331199646,
"adv/std_step_conf": 0.9356796741485596,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.8306440631808281,
"calib/avg_num_step_conf": 5.1171875,
"calib/ece": 0.16803212851405624,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.6666666666666666,
"calib/gap": 0.46171568627450965,
"calib/mean_conf": 0.7612048192771085,
"calib/mu_c": 0.9392156862745097,
"calib/mu_w": 0.47750000000000004,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.15738955823293174,
"calib/std_conf": 0.3656541618540561,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5442463958060288,
"calib/step_q_c_n": 763.0,
"calib/step_q_gap": 0.15053524406928293,
"calib/step_q_w": 0.3937111517367459,
"calib/step_q_w_n": 547.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2190.0,
"completions/max_terminated_length": 2190.0,
"completions/mean_length": 438.015625,
"completions/mean_terminated_length": 439.7333679199219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.1216,
"grad_norm": 0.04486739635467529,
"kl": 0.1123199462890625,
"learning_rate": 2.388888888888889e-06,
"loss": -0.0061,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03878115117549896,
"mask/share_reasoning": 0.8267152905464172,
"mask/share_step_conf": 0.130597323179245,
"num_tokens": 26614283.0,
"reward": 1.188596248626709,
"reward_std": 0.24334082007408142,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.8040202856063843,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8394063711166382,
"step": 114
},
{
"adv/mean_abs_final_conf": 0.7535337209701538,
"adv/mean_abs_reasoning": 0.6055867671966553,
"adv/mean_abs_step_conf": 0.7513902187347412,
"adv/ratio_final_to_reasoning": 1.2443034785227647,
"adv/ratio_step_to_reasoning": 1.2407639324964617,
"adv/std_final_conf": 0.9215371608734131,
"adv/std_reasoning": 0.8266196846961975,
"adv/std_step_conf": 0.9360374212265015,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6653112192622951,
"calib/avg_num_step_conf": 4.75390625,
"calib/ece": 0.34132,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.624,
"calib/gap": 0.2081083504098361,
"calib/mean_conf": 0.73524,
"calib/mu_c": 0.8367968750000001,
"calib/mu_w": 0.628688524590164,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.28228,
"calib/std_conf": 0.3860043295093981,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5139455782312926,
"calib/step_q_c_n": 588.0,
"calib/step_q_gap": 0.09846863069552148,
"calib/step_q_w": 0.41547694753577114,
"calib/step_q_w_n": 629.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2754.0,
"completions/max_terminated_length": 2754.0,
"completions/mean_length": 440.21875,
"completions/mean_terminated_length": 440.21875,
"completions/min_length": 160.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.12266666666666666,
"grad_norm": 0.05996016785502434,
"kl": 0.11113739013671875,
"learning_rate": 2.361111111111111e-06,
"loss": 0.0315,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03766099363565445,
"mask/share_reasoning": 0.8441624641418457,
"mask/share_step_conf": 0.11817656457424164,
"num_tokens": 26832243.0,
"reward": 1.0623983144760132,
"reward_std": 0.29575490951538086,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.6283395290374756,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8023255467414856,
"step": 115
},
{
"adv/mean_abs_final_conf": 0.7394832372665405,
"adv/mean_abs_reasoning": 0.51995849609375,
"adv/mean_abs_step_conf": 0.7374346256256104,
"adv/ratio_final_to_reasoning": 1.4221966615066322,
"adv/ratio_step_to_reasoning": 1.4182567092675196,
"adv/std_final_conf": 0.9036124348640442,
"adv/std_reasoning": 0.7754974961280823,
"adv/std_step_conf": 0.9353717565536499,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6897636294319481,
"calib/avg_num_step_conf": 5.40234375,
"calib/ece": 0.28478087649402395,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.5697211155378487,
"calib/gap": 0.27373236751810914,
"calib/mean_conf": 0.6809561752988048,
"calib/mu_c": 0.8216393442622951,
"calib/mu_w": 0.547906976744186,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.23984063745019926,
"calib/std_conf": 0.4027157337661665,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5136998254799302,
"calib/step_q_c_n": 573.0,
"calib/step_q_gap": 0.11805785017128828,
"calib/step_q_w": 0.3956419753086419,
"calib/step_q_w_n": 810.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2361.0,
"completions/max_terminated_length": 2361.0,
"completions/mean_length": 512.96875,
"completions/mean_terminated_length": 512.96875,
"completions/min_length": 126.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.12373333333333333,
"grad_norm": 0.03793274611234665,
"kl": 0.09812164306640625,
"learning_rate": 2.3333333333333336e-06,
"loss": -0.0113,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.034530505537986755,
"mask/share_reasoning": 0.8493834137916565,
"mask/share_step_conf": 0.11608609557151794,
"num_tokens": 27068083.0,
"reward": 1.1046454906463623,
"reward_std": 0.2591022551059723,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.6625652313232422,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8389630317687988,
"step": 116
},
{
"adv/mean_abs_final_conf": 0.6700632572174072,
"adv/mean_abs_reasoning": 0.5251960158348083,
"adv/mean_abs_step_conf": 0.7502630949020386,
"adv/ratio_final_to_reasoning": 1.2758346160572636,
"adv/ratio_step_to_reasoning": 1.42853919733089,
"adv/std_final_conf": 0.8910670876502991,
"adv/std_reasoning": 0.757550835609436,
"adv/std_step_conf": 0.9356370568275452,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7566102756892231,
"calib/avg_num_step_conf": 5.21875,
"calib/ece": 0.25251968503936995,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.562992125984252,
"calib/gap": 0.3700125313283208,
"calib/mean_conf": 0.6753543307086615,
"calib/mu_c": 0.879298245614035,
"calib/mu_w": 0.5092857142857142,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.239527559055118,
"calib/std_conf": 0.39802919070650244,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5111211573236889,
"calib/step_q_c_n": 553.0,
"calib/step_q_gap": 0.1128069810784782,
"calib/step_q_w": 0.3983141762452107,
"calib/step_q_w_n": 783.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2087.0,
"completions/max_terminated_length": 2087.0,
"completions/mean_length": 462.67578125,
"completions/mean_terminated_length": 462.67578125,
"completions/min_length": 148.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.1248,
"grad_norm": 0.039365384727716446,
"kl": 0.10528564453125,
"learning_rate": 2.305555555555556e-06,
"loss": 0.0083,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.035405274480581284,
"mask/share_reasoning": 0.8448545932769775,
"mask/share_step_conf": 0.11974013596773148,
"num_tokens": 27293128.0,
"reward": 1.1315536499023438,
"reward_std": 0.21968162059783936,
"rewards/accuracy_reward_step": 0.4453125,
"rewards/final_brier_reward_step": 0.7202702760696411,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8368913531303406,
"step": 117
},
{
"adv/mean_abs_final_conf": 0.586737871170044,
"adv/mean_abs_reasoning": 0.4335465431213379,
"adv/mean_abs_step_conf": 0.7783301472663879,
"adv/ratio_final_to_reasoning": 1.3533445958207813,
"adv/ratio_step_to_reasoning": 1.7952631836544353,
"adv/std_final_conf": 0.8127620220184326,
"adv/std_reasoning": 0.701391875743866,
"adv/std_step_conf": 0.9351849555969238,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7986656200941915,
"calib/avg_num_step_conf": 5.96875,
"calib/ece": 0.21003984063745013,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6454183266932271,
"calib/gap": 0.37197082679225535,
"calib/mean_conf": 0.7494820717131474,
"calib/mu_c": 0.9036054421768707,
"calib/mu_w": 0.5316346153846153,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.18693227091633458,
"calib/std_conf": 0.3743341108835886,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4696088019559902,
"calib/step_q_c_n": 818.0,
"calib/step_q_gap": 0.10428485829401835,
"calib/step_q_w": 0.36532394366197185,
"calib/step_q_w_n": 710.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1674.0,
"completions/max_terminated_length": 1674.0,
"completions/mean_length": 494.5078125,
"completions/mean_terminated_length": 494.5078125,
"completions/min_length": 151.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.12586666666666665,
"grad_norm": 0.03469162434339523,
"kl": 0.097198486328125,
"learning_rate": 2.277777777777778e-06,
"loss": 0.0425,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03486177325248718,
"mask/share_reasoning": 0.8360300660133362,
"mask/share_step_conf": 0.12910816073417664,
"num_tokens": 27523730.0,
"reward": 1.1617202758789062,
"reward_std": 0.2108428180217743,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7519367337226868,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8408982753753662,
"step": 118
},
{
"adv/mean_abs_final_conf": 0.6262631416320801,
"adv/mean_abs_reasoning": 0.5967061519622803,
"adv/mean_abs_step_conf": 0.7638027667999268,
"adv/ratio_final_to_reasoning": 1.04953357623782,
"adv/ratio_step_to_reasoning": 1.280031660957652,
"adv/std_final_conf": 0.861553966999054,
"adv/std_reasoning": 0.8265232443809509,
"adv/std_step_conf": 0.9356296062469482,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7408450704225351,
"calib/avg_num_step_conf": 4.8984375,
"calib/ece": 0.2215873015873016,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.49603174603174605,
"calib/gap": 0.3755697823303458,
"calib/mean_conf": 0.6175396825396826,
"calib/mu_c": 0.7814788732394367,
"calib/mu_w": 0.40590909090909083,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.13781746031746034,
"calib/std_conf": 0.4239210141088477,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5052832861189802,
"calib/step_q_c_n": 706.0,
"calib/step_q_gap": 0.11292927152044008,
"calib/step_q_w": 0.39235401459854014,
"calib/step_q_w_n": 548.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2863.0,
"completions/max_terminated_length": 2863.0,
"completions/mean_length": 525.4453125,
"completions/mean_terminated_length": 525.4453125,
"completions/min_length": 148.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.12693333333333334,
"grad_norm": 0.03453721106052399,
"kl": 0.0975494384765625,
"learning_rate": 2.25e-06,
"loss": -0.0452,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.033460669219493866,
"mask/share_reasoning": 0.8612064123153687,
"mask/share_step_conf": 0.10533291101455688,
"num_tokens": 27763308.0,
"reward": 1.1733651161193848,
"reward_std": 0.208999365568161,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.7443429231643677,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8630497455596924,
"step": 119
},
{
"adv/mean_abs_final_conf": 0.6886662244796753,
"adv/mean_abs_reasoning": 0.5544482469558716,
"adv/mean_abs_step_conf": 0.7756971120834351,
"adv/ratio_final_to_reasoning": 1.242074852361263,
"adv/ratio_step_to_reasoning": 1.3990433125946427,
"adv/std_final_conf": 0.8938544392585754,
"adv/std_reasoning": 0.8264629244804382,
"adv/std_step_conf": 0.9358031153678894,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7579295921924015,
"calib/avg_num_step_conf": 5.09375,
"calib/ece": 0.21085365853658533,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.5528455284552846,
"calib/gap": 0.42188846287905185,
"calib/mean_conf": 0.6501219512195122,
"calib/mu_c": 0.813046357615894,
"calib/mu_w": 0.3911578947368422,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.12357723577235769,
"calib/std_conf": 0.42475864185097756,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.47984756898817343,
"calib/step_q_c_n": 761.0,
"calib/step_q_gap": 0.08424904228467434,
"calib/step_q_w": 0.3955985267034991,
"calib/step_q_w_n": 543.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2722.0,
"completions/max_terminated_length": 2722.0,
"completions/mean_length": 466.35546875,
"completions/mean_terminated_length": 466.35546875,
"completions/min_length": 184.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.128,
"grad_norm": 0.051741085946559906,
"kl": 0.106475830078125,
"learning_rate": 2.222222222222222e-06,
"loss": 0.0191,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03581401705741882,
"mask/share_reasoning": 0.8447315692901611,
"mask/share_step_conf": 0.11945446580648422,
"num_tokens": 27989383.0,
"reward": 1.1456115245819092,
"reward_std": 0.2601540982723236,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7468265295028687,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.8228267431259155,
"step": 120
},
{
"adv/mean_abs_final_conf": 0.7501630187034607,
"adv/mean_abs_reasoning": 0.6250473260879517,
"adv/mean_abs_step_conf": 0.7540261149406433,
"adv/ratio_final_to_reasoning": 1.2001699509676869,
"adv/ratio_step_to_reasoning": 1.206350436950022,
"adv/std_final_conf": 0.9055350422859192,
"adv/std_reasoning": 0.8267970085144043,
"adv/std_step_conf": 0.9355260729789734,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7311981413873216,
"calib/avg_num_step_conf": 5.9453125,
"calib/ece": 0.24650406504065042,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.4959349593495935,
"calib/gap": 0.28248390308662463,
"calib/mean_conf": 0.6463414634146342,
"calib/mu_c": 0.7783969465648855,
"calib/mu_w": 0.4959130434782608,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.18016260162601627,
"calib/std_conf": 0.3992498418888415,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4725547445255474,
"calib/step_q_c_n": 685.0,
"calib/step_q_gap": 0.10243527021252469,
"calib/step_q_w": 0.3701194743130227,
"calib/step_q_w_n": 837.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2825.0,
"completions/max_terminated_length": 2825.0,
"completions/mean_length": 543.90234375,
"completions/mean_terminated_length": 548.18505859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.12906666666666666,
"grad_norm": 0.031234215945005417,
"kl": 0.09563446044921875,
"learning_rate": 2.1944444444444445e-06,
"loss": -0.0565,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03101031668484211,
"mask/share_reasoning": 0.849081814289093,
"mask/share_step_conf": 0.11209535598754883,
"num_tokens": 28233678.0,
"reward": 1.1141672134399414,
"reward_std": 0.26280272006988525,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.6912468671798706,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8283709287643433,
"step": 121
},
{
"adv/mean_abs_final_conf": 0.6409776210784912,
"adv/mean_abs_reasoning": 0.5207710266113281,
"adv/mean_abs_step_conf": 0.7637045383453369,
"adv/ratio_final_to_reasoning": 1.2308242746324634,
"adv/ratio_step_to_reasoning": 1.4664881479962202,
"adv/std_final_conf": 0.8490242958068848,
"adv/std_reasoning": 0.7752876877784729,
"adv/std_step_conf": 0.9349822998046875,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7824970752632264,
"calib/avg_num_step_conf": 5.00390625,
"calib/ece": 0.1734509803921569,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.5803921568627451,
"calib/gap": 0.439220720135188,
"calib/mean_conf": 0.682156862745098,
"calib/mu_c": 0.850955414012739,
"calib/mu_w": 0.411734693877551,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.11996078431372553,
"calib/std_conf": 0.407408422167366,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5227373612823675,
"calib/step_q_c_n": 811.0,
"calib/step_q_gap": 0.13122672298449511,
"calib/step_q_w": 0.39151063829787236,
"calib/step_q_w_n": 470.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1929.0,
"completions/max_terminated_length": 1929.0,
"completions/mean_length": 471.87109375,
"completions/mean_terminated_length": 473.7215881347656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.13013333333333332,
"grad_norm": 0.031476348638534546,
"kl": 0.1068267822265625,
"learning_rate": 2.166666666666667e-06,
"loss": -0.0158,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.0348004549741745,
"mask/share_reasoning": 0.8466958999633789,
"mask/share_step_conf": 0.11459743976593018,
"num_tokens": 28461821.0,
"reward": 1.2163907289505005,
"reward_std": 0.19097883999347687,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7977089881896973,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.875464916229248,
"step": 122
},
{
"adv/mean_abs_final_conf": 0.7445669770240784,
"adv/mean_abs_reasoning": 0.5697649717330933,
"adv/mean_abs_step_conf": 0.7346761226654053,
"adv/ratio_final_to_reasoning": 1.3067966862885196,
"adv/ratio_step_to_reasoning": 1.2894371523589638,
"adv/std_final_conf": 0.9072105884552002,
"adv/std_reasoning": 0.8099097609519958,
"adv/std_step_conf": 0.9359018206596375,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6791389680278569,
"calib/avg_num_step_conf": 5.90234375,
"calib/ece": 0.26869047619047615,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.49206349206349204,
"calib/gap": 0.2841538461538463,
"calib/mean_conf": 0.6205158730158731,
"calib/mu_c": 0.7524444444444446,
"calib/mu_w": 0.46829059829059827,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.17674603174603168,
"calib/std_conf": 0.4183273236928797,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.46036601307189545,
"calib/step_q_c_n": 765.0,
"calib/step_q_gap": 0.07938746079307502,
"calib/step_q_w": 0.38097855227882044,
"calib/step_q_w_n": 746.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1896.0,
"completions/max_terminated_length": 1896.0,
"completions/mean_length": 556.60546875,
"completions/mean_terminated_length": 558.7882690429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.1312,
"grad_norm": 0.033415503799915314,
"kl": 0.0966949462890625,
"learning_rate": 2.138888888888889e-06,
"loss": -0.0082,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03071346879005432,
"mask/share_reasoning": 0.8498426675796509,
"mask/share_step_conf": 0.115537628531456,
"num_tokens": 28709600.0,
"reward": 1.1285775899887085,
"reward_std": 0.25766658782958984,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.699337899684906,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8369823694229126,
"step": 123
},
{
"adv/mean_abs_final_conf": 0.6646140813827515,
"adv/mean_abs_reasoning": 0.45616310834884644,
"adv/mean_abs_step_conf": 0.7651708126068115,
"adv/ratio_final_to_reasoning": 1.4569658729931623,
"adv/ratio_step_to_reasoning": 1.67740617029831,
"adv/std_final_conf": 0.8431295156478882,
"adv/std_reasoning": 0.7392640113830566,
"adv/std_step_conf": 0.9358372092247009,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7841423948220064,
"calib/avg_num_step_conf": 5.1328125,
"calib/ece": 0.1950592885375494,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.541501976284585,
"calib/gap": 0.3796711974110033,
"calib/mean_conf": 0.6663636363636364,
"calib/mu_c": 0.8209333333333334,
"calib/mu_w": 0.4412621359223301,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1342687747035573,
"calib/std_conf": 0.3990317061978957,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5136621621621622,
"calib/step_q_c_n": 740.0,
"calib/step_q_gap": 0.13355763254543745,
"calib/step_q_w": 0.38010452961672475,
"calib/step_q_w_n": 574.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2241.0,
"completions/max_terminated_length": 2241.0,
"completions/mean_length": 480.98046875,
"completions/mean_terminated_length": 480.98046875,
"completions/min_length": 118.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.13226666666666667,
"grad_norm": 0.038355790078639984,
"kl": 0.1045684814453125,
"learning_rate": 2.1111111111111114e-06,
"loss": 0.0398,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03441513329744339,
"mask/share_reasoning": 0.8505345582962036,
"mask/share_step_conf": 0.1150503158569336,
"num_tokens": 28939547.0,
"reward": 1.1643469333648682,
"reward_std": 0.22691610455513,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.7644277215003967,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8334689140319824,
"step": 124
},
{
"adv/mean_abs_final_conf": 0.742373526096344,
"adv/mean_abs_reasoning": 0.47091740369796753,
"adv/mean_abs_step_conf": 0.7559845447540283,
"adv/ratio_final_to_reasoning": 1.5764410494636982,
"adv/ratio_step_to_reasoning": 1.60534424682018,
"adv/std_final_conf": 0.8906104564666748,
"adv/std_reasoning": 0.7393792271614075,
"adv/std_step_conf": 0.935725748538971,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6311610772357724,
"calib/avg_num_step_conf": 5.30859375,
"calib/ece": 0.3185258964143427,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.47808764940239046,
"calib/gap": 0.1802997967479676,
"calib/mean_conf": 0.6022709163346613,
"calib/mu_c": 0.690625,
"calib/mu_w": 0.5103252032520325,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.20541832669322713,
"calib/std_conf": 0.4166753519409285,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.47332365747460087,
"calib/step_q_c_n": 689.0,
"calib/step_q_gap": 0.07772664254922773,
"calib/step_q_w": 0.39559701492537314,
"calib/step_q_w_n": 670.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2833.0,
"completions/max_terminated_length": 2833.0,
"completions/mean_length": 510.54296875,
"completions/mean_terminated_length": 510.54296875,
"completions/min_length": 155.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.13333333333333333,
"grad_norm": 0.030706195160746574,
"kl": 0.096435546875,
"learning_rate": 2.0833333333333334e-06,
"loss": 0.0279,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.034426264464855194,
"mask/share_reasoning": 0.8521647453308105,
"mask/share_step_conf": 0.11340896785259247,
"num_tokens": 29175054.0,
"reward": 1.0779061317443848,
"reward_std": 0.24169524013996124,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.6451429724693298,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8107587695121765,
"step": 125
},
{
"adv/mean_abs_final_conf": 0.5916764140129089,
"adv/mean_abs_reasoning": 0.5020506381988525,
"adv/mean_abs_step_conf": 0.7531881928443909,
"adv/ratio_final_to_reasoning": 1.1785193942499428,
"adv/ratio_step_to_reasoning": 1.5002235542345186,
"adv/std_final_conf": 0.7995308637619019,
"adv/std_reasoning": 0.7754698395729065,
"adv/std_step_conf": 0.9352940320968628,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.7731808731808733,
"calib/avg_num_step_conf": 6.17578125,
"calib/ece": 0.2134439834024897,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.5352697095435685,
"calib/gap": 0.4235973665973665,
"calib/mean_conf": 0.6165145228215767,
"calib/mu_c": 0.8116153846153845,
"calib/mu_w": 0.388018018018018,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.14526970954356855,
"calib/std_conf": 0.4341896262370006,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5084081041968163,
"calib/step_q_c_n": 691.0,
"calib/step_q_gap": 0.18336316037659156,
"calib/step_q_w": 0.3250449438202247,
"calib/step_q_w_n": 890.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2599.0,
"completions/max_terminated_length": 2599.0,
"completions/mean_length": 515.25,
"completions/mean_terminated_length": 521.3596801757812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.1344,
"grad_norm": 0.032088715583086014,
"kl": 0.094940185546875,
"learning_rate": 2.0555555555555555e-06,
"loss": -0.0959,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.03380977362394333,
"mask/share_reasoning": 0.8281418085098267,
"mask/share_step_conf": 0.1263296753168106,
"num_tokens": 29412422.0,
"reward": 1.1038548946380615,
"reward_std": 0.2703258693218231,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.7186906337738037,
"rewards/format_reward_step": 0.9375,
"rewards/step_l2_reward": 0.7999711632728577,
"step": 126
},
{
"adv/mean_abs_final_conf": 0.6639594435691833,
"adv/mean_abs_reasoning": 0.42932218313217163,
"adv/mean_abs_step_conf": 0.7333925366401672,
"adv/ratio_final_to_reasoning": 1.5465295520608493,
"adv/ratio_step_to_reasoning": 1.7082567951406884,
"adv/std_final_conf": 0.8485727906227112,
"adv/std_reasoning": 0.7205451726913452,
"adv/std_step_conf": 0.9357307553291321,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.740875,
"calib/avg_num_step_conf": 5.71484375,
"calib/ece": 0.26549407114624507,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.4743083003952569,
"calib/gap": 0.327150625,
"calib/mean_conf": 0.5762450592885376,
"calib/mu_c": 0.74176,
"calib/mu_w": 0.414609375,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.17383399209486164,
"calib/std_conf": 0.4357109169931961,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.47575716234652116,
"calib/step_q_c_n": 733.0,
"calib/step_q_gap": 0.09505853220953492,
"calib/step_q_w": 0.38069863013698624,
"calib/step_q_w_n": 730.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1399.0,
"completions/max_terminated_length": 1399.0,
"completions/mean_length": 460.34375,
"completions/mean_terminated_length": 462.1490478515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.13546666666666668,
"grad_norm": 0.0348670557141304,
"kl": 0.1072998046875,
"learning_rate": 2.027777777777778e-06,
"loss": -0.061,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.0351860374212265,
"mask/share_reasoning": 0.8259862661361694,
"mask/share_step_conf": 0.13492143154144287,
"num_tokens": 29633942.0,
"reward": 1.1170024871826172,
"reward_std": 0.22174401581287384,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.7007777094841003,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8263181447982788,
"step": 127
},
{
"adv/mean_abs_final_conf": 0.7019212245941162,
"adv/mean_abs_reasoning": 0.5486740469932556,
"adv/mean_abs_step_conf": 0.7690510749816895,
"adv/ratio_final_to_reasoning": 1.2793045861029113,
"adv/ratio_step_to_reasoning": 1.401653821966073,
"adv/std_final_conf": 0.8775027990341187,
"adv/std_reasoning": 0.7754642963409424,
"adv/std_step_conf": 0.9357042908668518,
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.7367886178861789,
"calib/avg_num_step_conf": 4.9921875,
"calib/ece": 0.29071129707112975,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.5439330543933054,
"calib/gap": 0.3590573310905523,
"calib/mean_conf": 0.6215062761506277,
"calib/mu_c": 0.8062931034482759,
"calib/mu_w": 0.44723577235772355,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.21343096234309628,
"calib/std_conf": 0.43609650068580086,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.5024716981132075,
"calib/step_q_c_n": 530.0,
"calib/step_q_gap": 0.1295572596105336,
"calib/step_q_w": 0.37291443850267386,
"calib/step_q_w_n": 748.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2941.0,
"completions/max_terminated_length": 2941.0,
"completions/mean_length": 541.0703125,
"completions/mean_terminated_length": 543.1921997070312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 109.0,
"epoch": 0.13653333333333334,
"grad_norm": 0.12452156841754913,
"kl": 0.4658660888671875,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.0899,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.035640161484479904,
"mask/share_reasoning": 0.8514869809150696,
"mask/share_step_conf": 0.10896661132574081,
"num_tokens": 29879120.0,
"reward": 1.0647579431533813,
"reward_std": 0.2794821560382843,
"rewards/accuracy_reward_step": 0.453125,
"rewards/final_brier_reward_step": 0.6730015873908997,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l2_reward": 0.7866345643997192,
"step": 128
},
{
"adv/mean_abs_final_conf": 0.6740026473999023,
"adv/mean_abs_reasoning": 0.49119341373443604,
"adv/mean_abs_step_conf": 0.758554220199585,
"adv/ratio_final_to_reasoning": 1.372173625610343,
"adv/ratio_step_to_reasoning": 1.544308614467086,
"adv/std_final_conf": 0.8902783393859863,
"adv/std_reasoning": 0.775271475315094,
"adv/std_step_conf": 0.9356626868247986,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6853947368421052,
"calib/avg_num_step_conf": 5.78515625,
"calib/ece": 0.2503571428571427,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5436507936507936,
"calib/gap": 0.2583078947368421,
"calib/mean_conf": 0.709404761904762,
"calib/mu_c": 0.8119078947368421,
"calib/mu_w": 0.5536,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.17829365079365067,
"calib/std_conf": 0.3797921879252788,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4949701314217443,
"calib/step_q_c_n": 837.0,
"calib/step_q_gap": 0.06855708794348342,
"calib/step_q_w": 0.42641304347826087,
"calib/step_q_w_n": 644.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2260.0,
"completions/max_terminated_length": 2260.0,
"completions/mean_length": 464.4765625,
"completions/mean_terminated_length": 464.4765625,
"completions/min_length": 138.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.1376,
"grad_norm": 0.03653028607368469,
"kl": 0.10650634765625,
"learning_rate": 1.9722222222222224e-06,
"loss": -0.0337,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03688127547502518,
"mask/share_reasoning": 0.8289004564285278,
"mask/share_step_conf": 0.1342182755470276,
"num_tokens": 30100410.0,
"reward": 1.1344451904296875,
"reward_std": 0.21902605891227722,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7134796977043152,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.827044665813446,
"step": 129
},
{
"adv/mean_abs_final_conf": 0.6065406799316406,
"adv/mean_abs_reasoning": 0.4211353659629822,
"adv/mean_abs_step_conf": 0.7387837767601013,
"adv/ratio_final_to_reasoning": 1.4402511139018317,
"adv/ratio_step_to_reasoning": 1.7542667666268628,
"adv/std_final_conf": 0.8230589628219604,
"adv/std_reasoning": 0.7014061212539673,
"adv/std_step_conf": 0.9356253147125244,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7767603730233814,
"calib/avg_num_step_conf": 5.05078125,
"calib/ece": 0.19618473895582333,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.606425702811245,
"calib/gap": 0.42343357210433835,
"calib/mean_conf": 0.6834136546184739,
"calib/mu_c": 0.8500662251655628,
"calib/mu_w": 0.4266326530612245,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.13658634538152614,
"calib/std_conf": 0.41361564911772564,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5404630969609262,
"calib/step_q_c_n": 691.0,
"calib/step_q_gap": 0.1171740604160757,
"calib/step_q_w": 0.4232890365448505,
"calib/step_q_w_n": 602.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1970.0,
"completions/max_terminated_length": 1970.0,
"completions/mean_length": 455.4921875,
"completions/mean_terminated_length": 455.4921875,
"completions/min_length": 102.0,
"completions/min_terminated_length": 102.0,
"epoch": 0.13866666666666666,
"grad_norm": 0.036019667983055115,
"kl": 0.108154296875,
"learning_rate": 1.944444444444445e-06,
"loss": 0.0524,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03684893622994423,
"mask/share_reasoning": 0.8434115648269653,
"mask/share_step_conf": 0.11973953992128372,
"num_tokens": 30322304.0,
"reward": 1.1682312488555908,
"reward_std": 0.23199787735939026,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7613061666488647,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8422915935516357,
"step": 130
},
{
"adv/mean_abs_final_conf": 0.7187561392784119,
"adv/mean_abs_reasoning": 0.4014168381690979,
"adv/mean_abs_step_conf": 0.7524584531784058,
"adv/ratio_final_to_reasoning": 1.7905480561222347,
"adv/ratio_step_to_reasoning": 1.8745064522216945,
"adv/std_final_conf": 0.8986456394195557,
"adv/std_reasoning": 0.7012752294540405,
"adv/std_step_conf": 0.9357667565345764,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7624935765673175,
"calib/avg_num_step_conf": 5.48828125,
"calib/ece": 0.20059760956175296,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.3904382470119522,
"calib/gap": 0.38298111510791355,
"calib/mean_conf": 0.5310358565737051,
"calib/mu_c": 0.7431249999999999,
"calib/mu_w": 0.3601438848920864,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.14270916334661352,
"calib/std_conf": 0.4212839919835041,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.500577849117175,
"calib/step_q_c_n": 623.0,
"calib/step_q_gap": 0.11279012533200872,
"calib/step_q_w": 0.38778772378516624,
"calib/step_q_w_n": 782.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1942.0,
"completions/max_terminated_length": 1942.0,
"completions/mean_length": 460.75390625,
"completions/mean_terminated_length": 464.38189697265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 188.0,
"epoch": 0.13973333333333332,
"grad_norm": 0.033635661005973816,
"kl": 0.1043853759765625,
"learning_rate": 1.916666666666667e-06,
"loss": -0.085,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03563493862748146,
"mask/share_reasoning": 0.8335763216018677,
"mask/share_step_conf": 0.12297628074884415,
"num_tokens": 30546465.0,
"reward": 1.1304597854614258,
"reward_std": 0.22982972860336304,
"rewards/accuracy_reward_step": 0.4375,
"rewards/final_brier_reward_step": 0.7387917637825012,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8262102603912354,
"step": 131
},
{
"adv/mean_abs_final_conf": 0.6320432424545288,
"adv/mean_abs_reasoning": 0.5479155778884888,
"adv/mean_abs_step_conf": 0.7798007726669312,
"adv/ratio_final_to_reasoning": 1.153541289864844,
"adv/ratio_step_to_reasoning": 1.4232133637668456,
"adv/std_final_conf": 0.8763335943222046,
"adv/std_reasoning": 0.792772114276886,
"adv/std_step_conf": 0.9349549412727356,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.8292570978952201,
"calib/avg_num_step_conf": 5.4140625,
"calib/ece": 0.1606746031746032,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.4880952380952381,
"calib/gap": 0.46420890433414214,
"calib/mean_conf": 0.6292460317460317,
"calib/mu_c": 0.8152980132450331,
"calib/mu_w": 0.351089108910891,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.09535714285714289,
"calib/std_conf": 0.41082648379314624,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5584105960264901,
"calib/step_q_c_n": 755.0,
"calib/step_q_gap": 0.16641376559859788,
"calib/step_q_w": 0.39199683042789224,
"calib/step_q_w_n": 631.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1271.0,
"completions/max_terminated_length": 1271.0,
"completions/mean_length": 463.515625,
"completions/mean_terminated_length": 465.3333740234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 131.0,
"epoch": 0.1408,
"grad_norm": 0.03424832969903946,
"kl": 0.10589599609375,
"learning_rate": 1.888888888888889e-06,
"loss": -0.0352,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03672264143824577,
"mask/share_reasoning": 0.8319817781448364,
"mask/share_step_conf": 0.1273893415927887,
"num_tokens": 30770717.0,
"reward": 1.1921403408050537,
"reward_std": 0.20535215735435486,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7974933981895447,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8479623198509216,
"step": 132
},
{
"adv/mean_abs_final_conf": 0.7448168992996216,
"adv/mean_abs_reasoning": 0.5736385583877563,
"adv/mean_abs_step_conf": 0.7350783348083496,
"adv/ratio_final_to_reasoning": 1.2984080104255398,
"adv/ratio_step_to_reasoning": 1.2814311800697793,
"adv/std_final_conf": 0.8978567719459534,
"adv/std_reasoning": 0.8099358677864075,
"adv/std_step_conf": 0.9359154105186462,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7202092871157619,
"calib/avg_num_step_conf": 6.3359375,
"calib/ece": 0.2692369477911647,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.3614457831325301,
"calib/gap": 0.320380640941792,
"calib/mean_conf": 0.4864257028112449,
"calib/mu_c": 0.6652727272727272,
"calib/mu_w": 0.3448920863309352,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15694779116465862,
"calib/std_conf": 0.42975999127701225,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.46619533527696794,
"calib/step_q_c_n": 686.0,
"calib/step_q_gap": 0.09176157459320722,
"calib/step_q_w": 0.3744337606837607,
"calib/step_q_w_n": 936.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2130.0,
"completions/max_terminated_length": 2130.0,
"completions/mean_length": 569.0,
"completions/mean_terminated_length": 573.4802856445312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.14186666666666667,
"grad_norm": 0.04488792642951012,
"kl": 0.08905792236328125,
"learning_rate": 1.8611111111111113e-06,
"loss": -0.1102,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.030352286994457245,
"mask/share_reasoning": 0.8483153581619263,
"mask/share_step_conf": 0.11351984739303589,
"num_tokens": 31022725.0,
"reward": 1.120848298072815,
"reward_std": 0.2642374038696289,
"rewards/accuracy_reward_step": 0.43359375,
"rewards/final_brier_reward_step": 0.7049039006233215,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8370283842086792,
"step": 133
},
{
"adv/mean_abs_final_conf": 0.7327480912208557,
"adv/mean_abs_reasoning": 0.5926192998886108,
"adv/mean_abs_step_conf": 0.7497801780700684,
"adv/ratio_final_to_reasoning": 1.2364566786106757,
"adv/ratio_step_to_reasoning": 1.265197029882418,
"adv/std_final_conf": 0.9129754900932312,
"adv/std_reasoning": 0.8265247941017151,
"adv/std_step_conf": 0.9356361031532288,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.737291449426486,
"calib/avg_num_step_conf": 5.16015625,
"calib/ece": 0.20534136546184734,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.3092369477911647,
"calib/gap": 0.3447536496350365,
"calib/mean_conf": 0.461566265060241,
"calib/mu_c": 0.65125,
"calib/mu_w": 0.3064963503649635,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.10855421686746983,
"calib/std_conf": 0.4173645039328073,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5047840531561462,
"calib/step_q_c_n": 602.0,
"calib/step_q_gap": 0.11189114634112529,
"calib/step_q_w": 0.39289290681502087,
"calib/step_q_w_n": 719.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1945.0,
"completions/max_terminated_length": 1945.0,
"completions/mean_length": 525.98046875,
"completions/mean_terminated_length": 528.0431518554688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.14293333333333333,
"grad_norm": 0.030651414766907692,
"kl": 0.0981597900390625,
"learning_rate": 1.8333333333333333e-06,
"loss": -0.0658,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.031976088881492615,
"mask/share_reasoning": 0.8612660765647888,
"mask/share_step_conf": 0.10285161435604095,
"num_tokens": 31266328.0,
"reward": 1.1207489967346191,
"reward_std": 0.22252947092056274,
"rewards/accuracy_reward_step": 0.4375,
"rewards/final_brier_reward_step": 0.7283519506454468,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8207431435585022,
"step": 134
},
{
"adv/mean_abs_final_conf": 0.7493464946746826,
"adv/mean_abs_reasoning": 0.5230491161346436,
"adv/mean_abs_step_conf": 0.7491369843482971,
"adv/ratio_final_to_reasoning": 1.4326503411618146,
"adv/ratio_step_to_reasoning": 1.432249785420637,
"adv/std_final_conf": 0.913597822189331,
"adv/std_reasoning": 0.7754340767860413,
"adv/std_step_conf": 0.9354913830757141,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6451528952504879,
"calib/avg_num_step_conf": 5.64453125,
"calib/ece": 0.2869588313413015,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.41434262948207173,
"calib/gap": 0.2268954673606593,
"calib/mean_conf": 0.5647543160690571,
"calib/mu_c": 0.6605747126436782,
"calib/mu_w": 0.4336792452830189,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.13701195219123513,
"calib/std_conf": 0.41709440920609037,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4684268125854993,
"calib/step_q_c_n": 731.0,
"calib/step_q_gap": 0.06258647645104554,
"calib/step_q_w": 0.40584033613445375,
"calib/step_q_w_n": 714.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2558.0,
"completions/max_terminated_length": 2558.0,
"completions/mean_length": 508.18359375,
"completions/mean_terminated_length": 508.18359375,
"completions/min_length": 88.0,
"completions/min_terminated_length": 88.0,
"epoch": 0.144,
"grad_norm": 0.0449938029050827,
"kl": 0.094482421875,
"learning_rate": 1.8055555555555557e-06,
"loss": 0.0734,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03592613339424133,
"mask/share_reasoning": 0.8439304828643799,
"mask/share_step_conf": 0.12014340609312057,
"num_tokens": 31502303.0,
"reward": 1.1263656616210938,
"reward_std": 0.23089656233787537,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6755272150039673,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8462612628936768,
"step": 135
},
{
"adv/mean_abs_final_conf": 0.6829756498336792,
"adv/mean_abs_reasoning": 0.4903065860271454,
"adv/mean_abs_step_conf": 0.7625067234039307,
"adv/ratio_final_to_reasoning": 1.3929563038663055,
"adv/ratio_step_to_reasoning": 1.5551631267741428,
"adv/std_final_conf": 0.8802697062492371,
"adv/std_reasoning": 0.7576611638069153,
"adv/std_step_conf": 0.9353592991828918,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.8413996292372881,
"calib/avg_num_step_conf": 5.6875,
"calib/ece": 0.16715447154471547,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.3130081300813008,
"calib/gap": 0.46579978813559336,
"calib/mean_conf": 0.45186991869918697,
"calib/mu_c": 0.6942372881355934,
"calib/mu_w": 0.22843750000000002,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.06967479674796749,
"calib/std_conf": 0.4207443067906461,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.47006163328197237,
"calib/step_q_c_n": 649.0,
"calib/step_q_gap": 0.1524903817330257,
"calib/step_q_w": 0.31757125154894666,
"calib/step_q_w_n": 807.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2723.0,
"completions/max_terminated_length": 2723.0,
"completions/mean_length": 495.5,
"completions/mean_terminated_length": 499.4015808105469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.14506666666666668,
"grad_norm": 0.029826823621988297,
"kl": 0.10265350341796875,
"learning_rate": 1.777777777777778e-06,
"loss": 0.0001,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03438597545027733,
"mask/share_reasoning": 0.83380126953125,
"mask/share_step_conf": 0.12400020658969879,
"num_tokens": 31737639.0,
"reward": 1.1563998460769653,
"reward_std": 0.21687453985214233,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.7736788988113403,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8364971876144409,
"step": 136
},
{
"adv/mean_abs_final_conf": 0.6984125375747681,
"adv/mean_abs_reasoning": 0.4689386487007141,
"adv/mean_abs_step_conf": 0.7421602606773376,
"adv/ratio_final_to_reasoning": 1.4893473581455827,
"adv/ratio_step_to_reasoning": 1.5826382891101793,
"adv/std_final_conf": 0.8773199319839478,
"adv/std_reasoning": 0.7392724752426147,
"adv/std_step_conf": 0.9357515573501587,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7205172194688017,
"calib/avg_num_step_conf": 5.60546875,
"calib/ece": 0.2425498007968127,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.3745019920318725,
"calib/gap": 0.3167702376413776,
"calib/mean_conf": 0.5148207171314741,
"calib/mu_c": 0.6776229508196722,
"calib/mu_w": 0.36085271317829454,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13565737051792826,
"calib/std_conf": 0.421878114099864,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4733026113671275,
"calib/step_q_c_n": 651.0,
"calib/step_q_gap": 0.09498628483651522,
"calib/step_q_w": 0.37831632653061226,
"calib/step_q_w_n": 784.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2015.0,
"completions/max_terminated_length": 2015.0,
"completions/mean_length": 468.09375,
"completions/mean_terminated_length": 469.929443359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 111.0,
"epoch": 0.14613333333333334,
"grad_norm": 0.04951392114162445,
"kl": 0.116912841796875,
"learning_rate": 1.75e-06,
"loss": 0.0458,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.035196349024772644,
"mask/share_reasoning": 0.8351722955703735,
"mask/share_step_conf": 0.1257251352071762,
"num_tokens": 31964455.0,
"reward": 1.132115364074707,
"reward_std": 0.2261495590209961,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.7153968811035156,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8382851481437683,
"step": 137
},
{
"adv/mean_abs_final_conf": 0.7322195172309875,
"adv/mean_abs_reasoning": 0.5432929992675781,
"adv/mean_abs_step_conf": 0.7652676701545715,
"adv/ratio_final_to_reasoning": 1.3477433322684154,
"adv/ratio_step_to_reasoning": 1.408572669234172,
"adv/std_final_conf": 0.910886287689209,
"adv/std_reasoning": 0.7755010724067688,
"adv/std_step_conf": 0.9350913763046265,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7641581279423982,
"calib/avg_num_step_conf": 5.34765625,
"calib/ece": 0.21911646586345376,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.37349397590361444,
"calib/gap": 0.4090729714760454,
"calib/mean_conf": 0.5021686746987952,
"calib/mu_c": 0.653312101910828,
"calib/mu_w": 0.24423913043478263,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.045381526104417626,
"calib/std_conf": 0.4253665035142027,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4698145859085291,
"calib/step_q_c_n": 809.0,
"calib/step_q_gap": 0.10417172876567199,
"calib/step_q_w": 0.3656428571428571,
"calib/step_q_w_n": 560.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2259.0,
"completions/max_terminated_length": 2259.0,
"completions/mean_length": 486.3515625,
"completions/mean_terminated_length": 486.3515625,
"completions/min_length": 165.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.1472,
"grad_norm": 0.06074017286300659,
"kl": 0.10494232177734375,
"learning_rate": 1.7222222222222224e-06,
"loss": -0.0467,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.036103568971157074,
"mask/share_reasoning": 0.8429389595985413,
"mask/share_step_conf": 0.12095746397972107,
"num_tokens": 32193297.0,
"reward": 1.163382649421692,
"reward_std": 0.20006373524665833,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7394359111785889,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8467612266540527,
"step": 138
},
{
"adv/mean_abs_final_conf": 0.6884328722953796,
"adv/mean_abs_reasoning": 0.514777660369873,
"adv/mean_abs_step_conf": 0.782334566116333,
"adv/ratio_final_to_reasoning": 1.3373402252940298,
"adv/ratio_step_to_reasoning": 1.5197523636791417,
"adv/std_final_conf": 0.8758470416069031,
"adv/std_reasoning": 0.7753527164459229,
"adv/std_step_conf": 0.9349035620689392,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7622080679405521,
"calib/avg_num_step_conf": 5.296875,
"calib/ece": 0.22660079051383397,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.31620553359683795,
"calib/gap": 0.3659156050955414,
"calib/mean_conf": 0.4933201581027668,
"calib/mu_c": 0.6321656050955414,
"calib/mu_w": 0.26625000000000004,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.049683794466403156,
"calib/std_conf": 0.4093251510925892,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.49825699745547075,
"calib/step_q_c_n": 786.0,
"calib/step_q_gap": 0.10916927815722516,
"calib/step_q_w": 0.3890877192982456,
"calib/step_q_w_n": 570.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2838.0,
"completions/max_terminated_length": 2838.0,
"completions/mean_length": 449.41796875,
"completions/mean_terminated_length": 449.41796875,
"completions/min_length": 119.0,
"completions/min_terminated_length": 119.0,
"epoch": 0.14826666666666666,
"grad_norm": 0.09828542172908783,
"kl": 0.111328125,
"learning_rate": 1.6944444444444446e-06,
"loss": 0.0748,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.037707164883613586,
"mask/share_reasoning": 0.8396942615509033,
"mask/share_step_conf": 0.1225985437631607,
"num_tokens": 32411444.0,
"reward": 1.1785929203033447,
"reward_std": 0.19305641949176788,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7442941665649414,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8617194890975952,
"step": 139
},
{
"adv/mean_abs_final_conf": 0.6502406597137451,
"adv/mean_abs_reasoning": 0.6026022434234619,
"adv/mean_abs_step_conf": 0.7315477132797241,
"adv/ratio_final_to_reasoning": 1.0790544954158205,
"adv/ratio_step_to_reasoning": 1.2139810650616005,
"adv/std_final_conf": 0.8286058902740479,
"adv/std_reasoning": 0.8265973329544067,
"adv/std_step_conf": 0.935325026512146,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7739368998628258,
"calib/avg_num_step_conf": 4.9609375,
"calib/ece": 0.22099206349206352,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.48412698412698413,
"calib/gap": 0.3839876543209877,
"calib/mean_conf": 0.6066269841269841,
"calib/mu_c": 0.7437654320987654,
"calib/mu_w": 0.35977777777777775,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.09238095238095242,
"calib/std_conf": 0.41818406052623064,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5448609431680774,
"calib/step_q_c_n": 827.0,
"calib/step_q_gap": 0.10422888899200516,
"calib/step_q_w": 0.44063205417607226,
"calib/step_q_w_n": 443.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2434.0,
"completions/max_terminated_length": 2434.0,
"completions/mean_length": 475.62109375,
"completions/mean_terminated_length": 475.62109375,
"completions/min_length": 163.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.14933333333333335,
"grad_norm": 0.03260481357574463,
"kl": 0.0984954833984375,
"learning_rate": 1.6666666666666667e-06,
"loss": -0.0596,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.035779692232608795,
"mask/share_reasoning": 0.8465887904167175,
"mask/share_step_conf": 0.11763153970241547,
"num_tokens": 32638219.0,
"reward": 1.177355408668518,
"reward_std": 0.1992952972650528,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.7544379234313965,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8522652387619019,
"step": 140
},
{
"adv/mean_abs_final_conf": 0.652328372001648,
"adv/mean_abs_reasoning": 0.46055957674980164,
"adv/mean_abs_step_conf": 0.7508203983306885,
"adv/ratio_final_to_reasoning": 1.416382168416019,
"adv/ratio_step_to_reasoning": 1.630235123171851,
"adv/std_final_conf": 0.8770766258239746,
"adv/std_reasoning": 0.739296019077301,
"adv/std_step_conf": 0.9351637363433838,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.8754497001998668,
"calib/avg_num_step_conf": 5.4921875,
"calib/ece": 0.09928853754940715,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.4980237154150198,
"calib/gap": 0.567321785476349,
"calib/mean_conf": 0.6248221343873518,
"calib/mu_c": 0.8378481012658228,
"calib/mu_w": 0.2705263157894737,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.049802371541502015,
"calib/std_conf": 0.41490851887586083,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5223926380368098,
"calib/step_q_c_n": 815.0,
"calib/step_q_gap": 0.12699500690313803,
"calib/step_q_w": 0.39539763113367177,
"calib/step_q_w_n": 591.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2510.0,
"completions/max_terminated_length": 2510.0,
"completions/mean_length": 523.765625,
"completions/mean_terminated_length": 523.765625,
"completions/min_length": 159.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.1504,
"grad_norm": 0.037485457956790924,
"kl": 0.0937347412109375,
"learning_rate": 1.638888888888889e-06,
"loss": 0.0771,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03306419402360916,
"mask/share_reasoning": 0.8539596199989319,
"mask/share_step_conf": 0.11297617852687836,
"num_tokens": 32879399.0,
"reward": 1.2163152694702148,
"reward_std": 0.1883358359336853,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.8347440958023071,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8532785177230835,
"step": 141
},
{
"adv/mean_abs_final_conf": 0.7393682599067688,
"adv/mean_abs_reasoning": 0.49121570587158203,
"adv/mean_abs_step_conf": 0.7508542537689209,
"adv/ratio_final_to_reasoning": 1.5051804147729368,
"adv/ratio_step_to_reasoning": 1.528563205113022,
"adv/std_final_conf": 0.9064189195632935,
"adv/std_reasoning": 0.7752906084060669,
"adv/std_step_conf": 0.9352670907974243,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.8096784363177806,
"calib/avg_num_step_conf": 5.6875,
"calib/ece": 0.1771031746031746,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.43253968253968256,
"calib/gap": 0.4667931904161412,
"calib/mean_conf": 0.5708333333333334,
"calib/mu_c": 0.8116393442622951,
"calib/mu_w": 0.3448461538461539,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.1319047619047619,
"calib/std_conf": 0.4265072479333529,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5488201438848921,
"calib/step_q_c_n": 695.0,
"calib/step_q_gap": 0.1478608797587423,
"calib/step_q_w": 0.4009592641261498,
"calib/step_q_w_n": 761.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1579.0,
"completions/max_terminated_length": 1579.0,
"completions/mean_length": 504.42578125,
"completions/mean_terminated_length": 506.4039611816406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.15146666666666667,
"grad_norm": 0.03455263748764992,
"kl": 0.09305572509765625,
"learning_rate": 1.6111111111111113e-06,
"loss": -0.0546,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03336789086461067,
"mask/share_reasoning": 0.8411300182342529,
"mask/share_step_conf": 0.12159579992294312,
"num_tokens": 33113692.0,
"reward": 1.1650047302246094,
"reward_std": 0.22650645673274994,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.7737703323364258,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8437427282333374,
"step": 142
},
{
"adv/mean_abs_final_conf": 0.6579672694206238,
"adv/mean_abs_reasoning": 0.5036629438400269,
"adv/mean_abs_step_conf": 0.7606317400932312,
"adv/ratio_final_to_reasoning": 1.306364260995955,
"adv/ratio_step_to_reasoning": 1.5101999251603124,
"adv/std_final_conf": 0.8390932083129883,
"adv/std_reasoning": 0.7575718760490417,
"adv/std_step_conf": 0.9355660676956177,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.8124918672739101,
"calib/avg_num_step_conf": 5.4296875,
"calib/ece": 0.18055776892430278,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.4262948207171315,
"calib/gap": 0.41892843201040986,
"calib/mean_conf": 0.5998406374501992,
"calib/mu_c": 0.7767586206896552,
"calib/mu_w": 0.3578301886792453,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.10135458167330677,
"calib/std_conf": 0.4079225066312555,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5018259803921569,
"calib/step_q_c_n": 816.0,
"calib/step_q_gap": 0.11198560582769695,
"calib/step_q_w": 0.3898403745644599,
"calib/step_q_w_n": 574.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1895.0,
"completions/max_terminated_length": 1895.0,
"completions/mean_length": 495.2265625,
"completions/mean_terminated_length": 497.1686706542969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.15253333333333333,
"grad_norm": 0.3523845970630646,
"kl": 0.6697616577148438,
"learning_rate": 1.5833333333333333e-06,
"loss": -0.0899,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03295125439763069,
"mask/share_reasoning": 0.8419654965400696,
"mask/share_step_conf": 0.12117701023817062,
"num_tokens": 33347806.0,
"reward": 1.1694198846817017,
"reward_std": 0.20234212279319763,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.7747402191162109,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.837003767490387,
"step": 143
},
{
"adv/mean_abs_final_conf": 0.6553045511245728,
"adv/mean_abs_reasoning": 0.5001832842826843,
"adv/mean_abs_step_conf": 0.7231369018554688,
"adv/ratio_final_to_reasoning": 1.310128850196081,
"adv/ratio_step_to_reasoning": 1.445743839465814,
"adv/std_final_conf": 0.8448795676231384,
"adv/std_reasoning": 0.7576124668121338,
"adv/std_step_conf": 0.9357913136482239,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7382836071069394,
"calib/avg_num_step_conf": 5.56640625,
"calib/ece": 0.2080158730158729,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5198412698412699,
"calib/gap": 0.37706469996647674,
"calib/mean_conf": 0.6242857142857142,
"calib/mu_c": 0.7664331210191083,
"calib/mu_w": 0.3893684210526316,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.10464285714285702,
"calib/std_conf": 0.42411051377106473,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5042337164750957,
"calib/step_q_c_n": 870.0,
"calib/step_q_gap": 0.10962110386248308,
"calib/step_q_w": 0.39461261261261266,
"calib/step_q_w_n": 555.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1939.0,
"completions/max_terminated_length": 1939.0,
"completions/mean_length": 493.74609375,
"completions/mean_terminated_length": 493.74609375,
"completions/min_length": 75.0,
"completions/min_terminated_length": 75.0,
"epoch": 0.1536,
"grad_norm": 0.035648465156555176,
"kl": 0.1121368408203125,
"learning_rate": 1.5555555555555558e-06,
"loss": 0.036,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03582587465643883,
"mask/share_reasoning": 0.8412730097770691,
"mask/share_step_conf": 0.12290111184120178,
"num_tokens": 33578333.0,
"reward": 1.1678181886672974,
"reward_std": 0.21501518785953522,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7465636730194092,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8468817472457886,
"step": 144
},
{
"adv/mean_abs_final_conf": 0.6195697784423828,
"adv/mean_abs_reasoning": 0.4878042936325073,
"adv/mean_abs_step_conf": 0.7531744241714478,
"adv/ratio_final_to_reasoning": 1.2701195674779002,
"adv/ratio_step_to_reasoning": 1.5440094193571405,
"adv/std_final_conf": 0.8374097347259521,
"adv/std_reasoning": 0.7394381761550903,
"adv/std_step_conf": 0.9352357983589172,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.8002614379084968,
"calib/avg_num_step_conf": 6.0625,
"calib/ece": 0.14268774703557313,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.49407114624505927,
"calib/gap": 0.41498169934640516,
"calib/mean_conf": 0.6635573122529644,
"calib/mu_c": 0.8275816993464051,
"calib/mu_w": 0.41259999999999997,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.10075098814229248,
"calib/std_conf": 0.38478574921930536,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5201180637544274,
"calib/step_q_c_n": 847.0,
"calib/step_q_gap": 0.14028827652038484,
"calib/step_q_w": 0.37982978723404254,
"calib/step_q_w_n": 705.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2910.0,
"completions/max_terminated_length": 2910.0,
"completions/mean_length": 488.4765625,
"completions/mean_terminated_length": 488.4765625,
"completions/min_length": 152.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.15466666666666667,
"grad_norm": 0.03285250440239906,
"kl": 0.094085693359375,
"learning_rate": 1.527777777777778e-06,
"loss": 0.1492,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03602126985788345,
"mask/share_reasoning": 0.8300851583480835,
"mask/share_step_conf": 0.13389351963996887,
"num_tokens": 33806087.0,
"reward": 1.1905624866485596,
"reward_std": 0.19327059388160706,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.7945300340652466,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8467923402786255,
"step": 145
},
{
"adv/mean_abs_final_conf": 0.7271376848220825,
"adv/mean_abs_reasoning": 0.5126863121986389,
"adv/mean_abs_step_conf": 0.7399911880493164,
"adv/ratio_final_to_reasoning": 1.4182896393386741,
"adv/ratio_step_to_reasoning": 1.4433605314639428,
"adv/std_final_conf": 0.8864182233810425,
"adv/std_reasoning": 0.7927311062812805,
"adv/std_step_conf": 0.9353997111320496,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7329771290806021,
"calib/avg_num_step_conf": 6.0234375,
"calib/ece": 0.23456349206349206,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.4126984126984127,
"calib/gap": 0.34000716752459753,
"calib/mean_conf": 0.5587698412698413,
"calib/mu_c": 0.7598058252427183,
"calib/mu_w": 0.4197986577181208,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1923015873015873,
"calib/std_conf": 0.4220016486595798,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.49958400000000003,
"calib/step_q_c_n": 625.0,
"calib/step_q_gap": 0.10310635550708835,
"calib/step_q_w": 0.3964776444929117,
"calib/step_q_w_n": 917.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1837.0,
"completions/max_terminated_length": 1837.0,
"completions/mean_length": 511.69140625,
"completions/mean_terminated_length": 515.720458984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.15573333333333333,
"grad_norm": 0.04114718735218048,
"kl": 0.105072021484375,
"learning_rate": 1.5e-06,
"loss": 0.0185,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.032341986894607544,
"mask/share_reasoning": 0.8382286429405212,
"mask/share_step_conf": 0.12161687016487122,
"num_tokens": 34044296.0,
"reward": 1.1270300149917603,
"reward_std": 0.2149513065814972,
"rewards/accuracy_reward_step": 0.40234375,
"rewards/final_brier_reward_step": 0.7105585932731628,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8446260094642639,
"step": 146
},
{
"adv/mean_abs_final_conf": 0.6262341737747192,
"adv/mean_abs_reasoning": 0.40930497646331787,
"adv/mean_abs_step_conf": 0.7368854284286499,
"adv/ratio_final_to_reasoning": 1.5299940381517514,
"adv/ratio_step_to_reasoning": 1.8003334208048407,
"adv/std_final_conf": 0.8180379867553711,
"adv/std_reasoning": 0.7014212608337402,
"adv/std_step_conf": 0.9353666305541992,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7938918988648092,
"calib/avg_num_step_conf": 5.8125,
"calib/ece": 0.21275999999999995,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.54,
"calib/gap": 0.47253869969040235,
"calib/mean_conf": 0.60636,
"calib/mu_c": 0.8634210526315789,
"calib/mu_w": 0.3908823529411765,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.18155999999999994,
"calib/std_conf": 0.44288141798905944,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5424237804878049,
"calib/step_q_c_n": 656.0,
"calib/step_q_gap": 0.1334093574108819,
"calib/step_q_w": 0.40901442307692304,
"calib/step_q_w_n": 832.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2295.0,
"completions/max_terminated_length": 2295.0,
"completions/mean_length": 536.19140625,
"completions/mean_terminated_length": 536.19140625,
"completions/min_length": 165.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.1568,
"grad_norm": 0.04610983282327652,
"kl": 0.0912017822265625,
"learning_rate": 1.4722222222222225e-06,
"loss": -0.0097,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03225596994161606,
"mask/share_reasoning": 0.8475143313407898,
"mask/share_step_conf": 0.12022969126701355,
"num_tokens": 34285241.0,
"reward": 1.1384716033935547,
"reward_std": 0.26133859157562256,
"rewards/accuracy_reward_step": 0.4453125,
"rewards/final_brier_reward_step": 0.7456488013267517,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8328420519828796,
"step": 147
},
{
"adv/mean_abs_final_conf": 0.5644453763961792,
"adv/mean_abs_reasoning": 0.3946724534034729,
"adv/mean_abs_step_conf": 0.7691161036491394,
"adv/ratio_final_to_reasoning": 1.4301615720293197,
"adv/ratio_step_to_reasoning": 1.948745338106669,
"adv/std_final_conf": 0.7765293717384338,
"adv/std_reasoning": 0.6815993189811707,
"adv/std_step_conf": 0.9353897571563721,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7558139534883721,
"calib/avg_num_step_conf": 5.8203125,
"calib/ece": 0.1956521739130434,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6324110671936759,
"calib/gap": 0.38987438989376966,
"calib/mean_conf": 0.7222134387351778,
"calib/mu_c": 0.8470348837209302,
"calib/mu_w": 0.4571604938271605,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.11901185770750983,
"calib/std_conf": 0.40146549955043687,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.494751968503937,
"calib/step_q_c_n": 1016.0,
"calib/step_q_gap": 0.08905576597229142,
"calib/step_q_w": 0.40569620253164557,
"calib/step_q_w_n": 474.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1434.0,
"completions/max_terminated_length": 1434.0,
"completions/mean_length": 476.58203125,
"completions/mean_terminated_length": 478.4510192871094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.15786666666666666,
"grad_norm": 0.04696307331323624,
"kl": 0.0993499755859375,
"learning_rate": 1.4444444444444445e-06,
"loss": -0.0649,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.036317791789770126,
"mask/share_reasoning": 0.8251909017562866,
"mask/share_step_conf": 0.13458506762981415,
"num_tokens": 34512358.0,
"reward": 1.1845102310180664,
"reward_std": 0.16278135776519775,
"rewards/accuracy_reward_step": 0.671875,
"rewards/final_brier_reward_step": 0.7798437476158142,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8380970358848572,
"step": 148
},
{
"adv/mean_abs_final_conf": 0.6602301001548767,
"adv/mean_abs_reasoning": 0.5817610025405884,
"adv/mean_abs_step_conf": 0.7569714784622192,
"adv/ratio_final_to_reasoning": 1.134882017308841,
"adv/ratio_step_to_reasoning": 1.3011726037951585,
"adv/std_final_conf": 0.8782934546470642,
"adv/std_reasoning": 0.8265743255615234,
"adv/std_step_conf": 0.9354011416435242,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.8224085855664803,
"calib/avg_num_step_conf": 5.40234375,
"calib/ece": 0.1583466666666667,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.552,
"calib/gap": 0.5193470856628752,
"calib/mean_conf": 0.6600533333333333,
"calib/mu_c": 0.903107769423559,
"calib/mu_w": 0.3837606837606837,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.14320000000000005,
"calib/std_conf": 0.41802636471878185,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5346911764705883,
"calib/step_q_c_n": 680.0,
"calib/step_q_gap": 0.1386883315203749,
"calib/step_q_w": 0.3960028449502134,
"calib/step_q_w_n": 703.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1900.0,
"completions/max_terminated_length": 1900.0,
"completions/mean_length": 507.0546875,
"completions/mean_terminated_length": 507.0546875,
"completions/min_length": 101.0,
"completions/min_terminated_length": 101.0,
"epoch": 0.15893333333333334,
"grad_norm": 0.03461736813187599,
"kl": 0.09224700927734375,
"learning_rate": 1.4166666666666667e-06,
"loss": 0.0572,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03385186567902565,
"mask/share_reasoning": 0.8462405204772949,
"mask/share_step_conf": 0.11990756541490555,
"num_tokens": 34746620.0,
"reward": 1.1829655170440674,
"reward_std": 0.23066285252571106,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.7954006195068359,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8485827445983887,
"step": 149
},
{
"adv/mean_abs_final_conf": 0.7037779688835144,
"adv/mean_abs_reasoning": 0.5218222141265869,
"adv/mean_abs_step_conf": 0.7530713081359863,
"adv/ratio_final_to_reasoning": 1.3486930027719892,
"adv/ratio_step_to_reasoning": 1.4431568602276896,
"adv/std_final_conf": 0.8822007775306702,
"adv/std_reasoning": 0.7575706839561462,
"adv/std_step_conf": 0.9354367256164551,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7739515198153135,
"calib/avg_num_step_conf": 5.49609375,
"calib/ece": 0.2362549800796813,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6215139442231076,
"calib/gap": 0.38889444658201877,
"calib/mean_conf": 0.7207171314741037,
"calib/mu_c": 0.8957971014492754,
"calib/mu_w": 0.5069026548672566,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.20358565737051795,
"calib/std_conf": 0.40392432980510673,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5245503355704698,
"calib/step_q_c_n": 745.0,
"calib/step_q_gap": 0.13456544131065107,
"calib/step_q_w": 0.38998489425981875,
"calib/step_q_w_n": 662.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2144.0,
"completions/max_terminated_length": 2144.0,
"completions/mean_length": 460.9921875,
"completions/mean_terminated_length": 460.9921875,
"completions/min_length": 154.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.16,
"grad_norm": 0.0454990491271019,
"kl": 0.10170745849609375,
"learning_rate": 1.3888888888888892e-06,
"loss": 0.025,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.0382879413664341,
"mask/share_reasoning": 0.8248406648635864,
"mask/share_step_conf": 0.1368713676929474,
"num_tokens": 34969594.0,
"reward": 1.1438206434249878,
"reward_std": 0.2367713451385498,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.7262125015258789,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8399109244346619,
"step": 150
},
{
"adv/mean_abs_final_conf": 0.7293127775192261,
"adv/mean_abs_reasoning": 0.5311736464500427,
"adv/mean_abs_step_conf": 0.7511847019195557,
"adv/ratio_final_to_reasoning": 1.3730213883790985,
"adv/ratio_step_to_reasoning": 1.4141979876823674,
"adv/std_final_conf": 0.9007099866867065,
"adv/std_reasoning": 0.7754827737808228,
"adv/std_step_conf": 0.9359373450279236,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7826836728053621,
"calib/avg_num_step_conf": 5.74609375,
"calib/ece": 0.18302419354838714,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.4717741935483871,
"calib/gap": 0.4641276761892367,
"calib/mean_conf": 0.5780241935483871,
"calib/mu_c": 0.815702479338843,
"calib/mu_w": 0.35157480314960626,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.13657258064516137,
"calib/std_conf": 0.4409535664917214,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.4977232142857143,
"calib/step_q_c_n": 672.0,
"calib/step_q_gap": 0.14567064857858042,
"calib/step_q_w": 0.3520525657071339,
"calib/step_q_w_n": 799.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2129.0,
"completions/max_terminated_length": 2129.0,
"completions/mean_length": 524.390625,
"completions/mean_terminated_length": 526.4470825195312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.16106666666666666,
"grad_norm": 0.03694000095129013,
"kl": 0.0893096923828125,
"learning_rate": 1.3611111111111112e-06,
"loss": -0.0521,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03346676751971245,
"mask/share_reasoning": 0.8463730216026306,
"mask/share_step_conf": 0.11625394225120544,
"num_tokens": 35210862.0,
"reward": 1.126600742340088,
"reward_std": 0.23628029227256775,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.751246452331543,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8096367120742798,
"step": 151
},
{
"adv/mean_abs_final_conf": 0.7440125942230225,
"adv/mean_abs_reasoning": 0.6250966787338257,
"adv/mean_abs_step_conf": 0.7569712400436401,
"adv/ratio_final_to_reasoning": 1.1902360379358738,
"adv/ratio_step_to_reasoning": 1.210966664511696,
"adv/std_final_conf": 0.8971284627914429,
"adv/std_reasoning": 0.8266865015029907,
"adv/std_step_conf": 0.9358736276626587,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7288417966146875,
"calib/avg_num_step_conf": 5.47265625,
"calib/ece": 0.23567346938775513,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.40408163265306124,
"calib/gap": 0.35499533519925375,
"calib/mean_conf": 0.553795918367347,
"calib/mu_c": 0.730569105691057,
"calib/mu_w": 0.3755737704918033,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.14371428571428574,
"calib/std_conf": 0.4257429031412937,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.46891547049441784,
"calib/step_q_c_n": 627.0,
"calib/step_q_gap": 0.12126689168304827,
"calib/step_q_w": 0.34764857881136957,
"calib/step_q_w_n": 774.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2545.0,
"completions/max_terminated_length": 2545.0,
"completions/mean_length": 497.62890625,
"completions/mean_terminated_length": 499.5804138183594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.16213333333333332,
"grad_norm": 0.038726769387722015,
"kl": 0.09711456298828125,
"learning_rate": 1.3333333333333334e-06,
"loss": -0.11,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.03548308089375496,
"mask/share_reasoning": 0.8377888202667236,
"mask/share_step_conf": 0.1228218674659729,
"num_tokens": 35443647.0,
"reward": 1.0959519147872925,
"reward_std": 0.2796997129917145,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.7023949027061462,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": 0.8029018640518188,
"step": 152
},
{
"adv/mean_abs_final_conf": 0.718186616897583,
"adv/mean_abs_reasoning": 0.5393495559692383,
"adv/mean_abs_step_conf": 0.7579127550125122,
"adv/ratio_final_to_reasoning": 1.3315791381471809,
"adv/ratio_step_to_reasoning": 1.4052347807174974,
"adv/std_final_conf": 0.8757449388504028,
"adv/std_reasoning": 0.7927350997924805,
"adv/std_step_conf": 0.9352637529373169,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6873619233268355,
"calib/avg_num_step_conf": 6.203125,
"calib/ece": 0.3093172690763052,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.3534136546184739,
"calib/gap": 0.25028849902534117,
"calib/mean_conf": 0.48955823293172696,
"calib/mu_c": 0.6041481481481482,
"calib/mu_w": 0.35385964912280704,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.12835341365461844,
"calib/std_conf": 0.430426385244852,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.46790502793296096,
"calib/step_q_c_n": 716.0,
"calib/step_q_gap": 0.13672383527241055,
"calib/step_q_w": 0.3311811926605504,
"calib/step_q_w_n": 872.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2287.0,
"completions/max_terminated_length": 2287.0,
"completions/mean_length": 515.41015625,
"completions/mean_terminated_length": 517.431396484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.1632,
"grad_norm": 0.03288364037871361,
"kl": 0.09600830078125,
"learning_rate": 1.3055555555555556e-06,
"loss": 0.0006,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03258873149752617,
"mask/share_reasoning": 0.8409442901611328,
"mask/share_step_conf": 0.1225607693195343,
"num_tokens": 35682912.0,
"reward": 1.0964607000350952,
"reward_std": 0.2279350906610489,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.6613730192184448,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8225946426391602,
"step": 153
},
{
"adv/mean_abs_final_conf": 0.6926400661468506,
"adv/mean_abs_reasoning": 0.46163809299468994,
"adv/mean_abs_step_conf": 0.7453674077987671,
"adv/ratio_final_to_reasoning": 1.5003962555464803,
"adv/ratio_step_to_reasoning": 1.6146141731144807,
"adv/std_final_conf": 0.8909844756126404,
"adv/std_reasoning": 0.7392441034317017,
"adv/std_step_conf": 0.9355279803276062,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.730795739348371,
"calib/avg_num_step_conf": 5.23046875,
"calib/ece": 0.21984189723320158,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.45849802371541504,
"calib/gap": 0.41358583959899753,
"calib/mean_conf": 0.558498023715415,
"calib/mu_c": 0.7759166666666667,
"calib/mu_w": 0.36233082706766917,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.15201581027667985,
"calib/std_conf": 0.44135180973523674,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5033224222585925,
"calib/step_q_c_n": 611.0,
"calib/step_q_gap": 0.12163286181903205,
"calib/step_q_w": 0.3816895604395604,
"calib/step_q_w_n": 728.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2423.0,
"completions/max_terminated_length": 2423.0,
"completions/mean_length": 469.078125,
"completions/mean_terminated_length": 470.91766357421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 185.0,
"epoch": 0.16426666666666667,
"grad_norm": 0.0505913607776165,
"kl": 0.112213134765625,
"learning_rate": 1.2777777777777779e-06,
"loss": -0.0439,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03482554107904434,
"mask/share_reasoning": 0.8422807455062866,
"mask/share_step_conf": 0.11898745596408844,
"num_tokens": 35907436.0,
"reward": 1.1507153511047363,
"reward_std": 0.2105042189359665,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.7422734498977661,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8456881642341614,
"step": 154
},
{
"adv/mean_abs_final_conf": 0.770971417427063,
"adv/mean_abs_reasoning": 0.47824519872665405,
"adv/mean_abs_step_conf": 0.7767425775527954,
"adv/ratio_final_to_reasoning": 1.6120839675543082,
"adv/ratio_step_to_reasoning": 1.624151334129233,
"adv/std_final_conf": 0.9311116337776184,
"adv/std_reasoning": 0.7392560243606567,
"adv/std_step_conf": 0.9357247352600098,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6687376968503935,
"calib/avg_num_step_conf": 5.59375,
"calib/ece": 0.252,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.2784313725490196,
"calib/gap": 0.27756828248031495,
"calib/mean_conf": 0.4494117647058824,
"calib/mu_c": 0.5887401574803149,
"calib/mu_w": 0.311171875,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10168627450980394,
"calib/std_conf": 0.4219581419845835,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4082369942196532,
"calib/step_q_c_n": 692.0,
"calib/step_q_gap": 0.07377753476019372,
"calib/step_q_w": 0.3344594594594595,
"calib/step_q_w_n": 740.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2650.0,
"completions/max_terminated_length": 2650.0,
"completions/mean_length": 453.2890625,
"completions/mean_terminated_length": 453.2890625,
"completions/min_length": 150.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.16533333333333333,
"grad_norm": 0.039521224796772,
"kl": 0.1176300048828125,
"learning_rate": 1.25e-06,
"loss": 0.0123,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03726223111152649,
"mask/share_reasoning": 0.8310775756835938,
"mask/share_step_conf": 0.13166022300720215,
"num_tokens": 36130694.0,
"reward": 1.1223604679107666,
"reward_std": 0.20777511596679688,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.7056055068969727,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8271185159683228,
"step": 155
},
{
"adv/mean_abs_final_conf": 0.699209451675415,
"adv/mean_abs_reasoning": 0.4771537184715271,
"adv/mean_abs_step_conf": 0.7692943811416626,
"adv/ratio_final_to_reasoning": 1.465375673724606,
"adv/ratio_step_to_reasoning": 1.6122569129419206,
"adv/std_final_conf": 0.8949079513549805,
"adv/std_reasoning": 0.7392115592956543,
"adv/std_step_conf": 0.9355168342590332,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7784376571141276,
"calib/avg_num_step_conf": 5.9296875,
"calib/ece": 0.23126482213438737,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.48616600790513836,
"calib/gap": 0.4132151835093013,
"calib/mean_conf": 0.5733201581027668,
"calib/mu_c": 0.7644117647058825,
"calib/mu_w": 0.35119658119658115,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1335177865612648,
"calib/std_conf": 0.4449177703800726,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.44553410553410555,
"calib/step_q_c_n": 777.0,
"calib/step_q_gap": 0.11217378164746589,
"calib/step_q_w": 0.33336032388663966,
"calib/step_q_w_n": 741.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1866.0,
"completions/max_terminated_length": 1866.0,
"completions/mean_length": 472.9140625,
"completions/mean_terminated_length": 474.7686462402344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.1664,
"grad_norm": 0.03909357264637947,
"kl": 0.10540771484375,
"learning_rate": 1.2222222222222223e-06,
"loss": -0.0407,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.035646237432956696,
"mask/share_reasoning": 0.827398419380188,
"mask/share_step_conf": 0.1330491006374359,
"num_tokens": 36356520.0,
"reward": 1.1607627868652344,
"reward_std": 0.20456355810165405,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.7487425804138184,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.845917820930481,
"step": 156
},
{
"adv/mean_abs_final_conf": 0.6460797190666199,
"adv/mean_abs_reasoning": 0.4546028971672058,
"adv/mean_abs_step_conf": 0.7391088604927063,
"adv/ratio_final_to_reasoning": 1.4211957800809785,
"adv/ratio_step_to_reasoning": 1.62583403031164,
"adv/std_final_conf": 0.8762221932411194,
"adv/std_reasoning": 0.7391842007637024,
"adv/std_step_conf": 0.9354991316795349,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7898527528809218,
"calib/avg_num_step_conf": 6.56640625,
"calib/ece": 0.18361111111111106,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.49206349206349204,
"calib/gap": 0.5060909090909091,
"calib/mean_conf": 0.5740873015873017,
"calib/mu_c": 0.795,
"calib/mu_w": 0.28890909090909095,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.09710317460317458,
"calib/std_conf": 0.44950453358310655,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.46499999999999997,
"calib/step_q_c_n": 928.0,
"calib/step_q_gap": 0.12450863213811414,
"calib/step_q_w": 0.34049136786188583,
"calib/step_q_w_n": 753.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2257.0,
"completions/max_terminated_length": 2257.0,
"completions/mean_length": 498.671875,
"completions/mean_terminated_length": 498.671875,
"completions/min_length": 157.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.16746666666666668,
"grad_norm": 0.07070081681013107,
"kl": 0.099639892578125,
"learning_rate": 1.1944444444444446e-06,
"loss": 0.0253,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03461083769798279,
"mask/share_reasoning": 0.8251791000366211,
"mask/share_step_conf": 0.1402100920677185,
"num_tokens": 36587908.0,
"reward": 1.1632893085479736,
"reward_std": 0.20778614282608032,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7844120860099792,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8229026794433594,
"step": 157
},
{
"adv/mean_abs_final_conf": 0.7103238701820374,
"adv/mean_abs_reasoning": 0.48325201869010925,
"adv/mean_abs_step_conf": 0.771677553653717,
"adv/ratio_final_to_reasoning": 1.4698828824500791,
"adv/ratio_step_to_reasoning": 1.5968428973052338,
"adv/std_final_conf": 0.8656406998634338,
"adv/std_reasoning": 0.7394470572471619,
"adv/std_step_conf": 0.935847282409668,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7064367816091954,
"calib/avg_num_step_conf": 5.6484375,
"calib/ece": 0.237,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.472,
"calib/gap": 0.3402331691297209,
"calib/mean_conf": 0.60724,
"calib/mu_c": 0.7501379310344828,
"calib/mu_w": 0.4099047619047619,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.13212,
"calib/std_conf": 0.4316201830313314,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.48336461126005364,
"calib/step_q_c_n": 746.0,
"calib/step_q_gap": 0.12709318268862502,
"calib/step_q_w": 0.3562714285714286,
"calib/step_q_w_n": 700.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2653.0,
"completions/max_terminated_length": 2653.0,
"completions/mean_length": 493.94921875,
"completions/mean_terminated_length": 493.94921875,
"completions/min_length": 138.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.16853333333333334,
"grad_norm": 0.031547460705041885,
"kl": 0.10711669921875,
"learning_rate": 1.1666666666666668e-06,
"loss": 0.0507,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03697506710886955,
"mask/share_reasoning": 0.8330240249633789,
"mask/share_step_conf": 0.13000091910362244,
"num_tokens": 36819599.0,
"reward": 1.1219482421875,
"reward_std": 0.25272446870803833,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.7178941369056702,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.81160569190979,
"step": 158
},
{
"adv/mean_abs_final_conf": 0.6777149438858032,
"adv/mean_abs_reasoning": 0.5489553213119507,
"adv/mean_abs_step_conf": 0.7470642924308777,
"adv/ratio_final_to_reasoning": 1.2345539201006912,
"adv/ratio_step_to_reasoning": 1.3608835973125564,
"adv/std_final_conf": 0.8770651817321777,
"adv/std_reasoning": 0.7754817008972168,
"adv/std_step_conf": 0.9351387619972229,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7349534777799466,
"calib/avg_num_step_conf": 5.12890625,
"calib/ece": 0.23884000000000002,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.436,
"calib/gap": 0.3636664714685406,
"calib/mean_conf": 0.55364,
"calib/mu_c": 0.7121985815602837,
"calib/mu_w": 0.3485321100917431,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.11424000000000001,
"calib/std_conf": 0.44136691131075967,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.4798344827586207,
"calib/step_q_c_n": 725.0,
"calib/step_q_gap": 0.13508958479943706,
"calib/step_q_w": 0.3447448979591836,
"calib/step_q_w_n": 588.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1677.0,
"completions/max_terminated_length": 1677.0,
"completions/mean_length": 432.18359375,
"completions/mean_terminated_length": 435.58660888671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.1696,
"grad_norm": 0.05171238258481026,
"kl": 0.1108245849609375,
"learning_rate": 1.138888888888889e-06,
"loss": -0.0979,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03822503238916397,
"mask/share_reasoning": 0.8270611763000488,
"mask/share_step_conf": 0.1269012987613678,
"num_tokens": 37035022.0,
"reward": 1.1491003036499023,
"reward_std": 0.22951547801494598,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7168339490890503,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8511192798614502,
"step": 159
},
{
"adv/mean_abs_final_conf": 0.7331618666648865,
"adv/mean_abs_reasoning": 0.559362530708313,
"adv/mean_abs_step_conf": 0.7502003908157349,
"adv/ratio_final_to_reasoning": 1.3107096496729838,
"adv/ratio_step_to_reasoning": 1.341170260127661,
"adv/std_final_conf": 0.9073959589004517,
"adv/std_reasoning": 0.792900562286377,
"adv/std_step_conf": 0.9360281825065613,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7023730830248547,
"calib/avg_num_step_conf": 6.2578125,
"calib/ece": 0.2711382113821138,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.43089430894308944,
"calib/gap": 0.3433593336858805,
"calib/mean_conf": 0.5052032520325204,
"calib/mu_c": 0.6782786885245902,
"calib/mu_w": 0.33491935483870966,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.14020325203252035,
"calib/std_conf": 0.4591258034613807,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4382012195121951,
"calib/step_q_c_n": 656.0,
"calib/step_q_gap": 0.11328578610838957,
"calib/step_q_w": 0.3249154334038055,
"calib/step_q_w_n": 946.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2457.0,
"completions/max_terminated_length": 2457.0,
"completions/mean_length": 507.83203125,
"completions/mean_terminated_length": 511.8307189941406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.17066666666666666,
"grad_norm": 0.03477999195456505,
"kl": 0.110809326171875,
"learning_rate": 1.111111111111111e-06,
"loss": 0.0361,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03321065753698349,
"mask/share_reasoning": 0.8289576768875122,
"mask/share_step_conf": 0.1300192028284073,
"num_tokens": 37269867.0,
"reward": 1.0891594886779785,
"reward_std": 0.2826342284679413,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.6830366849899292,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.8057091236114502,
"step": 160
},
{
"adv/mean_abs_final_conf": 0.666127622127533,
"adv/mean_abs_reasoning": 0.4846762716770172,
"adv/mean_abs_step_conf": 0.7656421661376953,
"adv/ratio_final_to_reasoning": 1.3743763849273662,
"adv/ratio_step_to_reasoning": 1.5796980600855794,
"adv/std_final_conf": 0.8550475239753723,
"adv/std_reasoning": 0.739328145980835,
"adv/std_step_conf": 0.9354023337364197,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7183908045977012,
"calib/avg_num_step_conf": 5.25,
"calib/ece": 0.29035573122529645,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.41106719367588934,
"calib/gap": 0.3515240797322857,
"calib/mean_conf": 0.5158102766798419,
"calib/mu_c": 0.6255747126436781,
"calib/mu_w": 0.2740506329113924,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.059209486166007914,
"calib/std_conf": 0.4429900153822679,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4532934782608695,
"calib/step_q_c_n": 920.0,
"calib/step_q_gap": 0.10897744052502045,
"calib/step_q_w": 0.34431603773584907,
"calib/step_q_w_n": 424.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1863.0,
"completions/max_terminated_length": 1863.0,
"completions/mean_length": 451.203125,
"completions/mean_terminated_length": 451.203125,
"completions/min_length": 103.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.17173333333333332,
"grad_norm": 0.06945797055959702,
"kl": 0.1070098876953125,
"learning_rate": 1.0833333333333335e-06,
"loss": 0.0184,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03743017092347145,
"mask/share_reasoning": 0.8344805240631104,
"mask/share_step_conf": 0.12808924913406372,
"num_tokens": 37489295.0,
"reward": 1.1474106311798096,
"reward_std": 0.19529061019420624,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/final_brier_reward_step": 0.6996968984603882,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8415412306785583,
"step": 161
},
{
"adv/mean_abs_final_conf": 0.6987650394439697,
"adv/mean_abs_reasoning": 0.584774374961853,
"adv/mean_abs_step_conf": 0.7395343780517578,
"adv/ratio_final_to_reasoning": 1.1949310184625528,
"adv/ratio_step_to_reasoning": 1.2646490847004033,
"adv/std_final_conf": 0.902337372303009,
"adv/std_reasoning": 0.8099278807640076,
"adv/std_step_conf": 0.9356653094291687,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7526661197703035,
"calib/avg_num_step_conf": 5.1796875,
"calib/ece": 0.2203187250996015,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.545816733067729,
"calib/gap": 0.4302433688815969,
"calib/mean_conf": 0.6168924302788844,
"calib/mu_c": 0.7745911949685534,
"calib/mu_w": 0.3443478260869565,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.10187250996015929,
"calib/std_conf": 0.44126031062801013,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4804425837320574,
"calib/step_q_c_n": 836.0,
"calib/step_q_gap": 0.08754462454838391,
"calib/step_q_w": 0.3928979591836735,
"calib/step_q_w_n": 490.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2978.0,
"completions/max_terminated_length": 2978.0,
"completions/mean_length": 443.77734375,
"completions/mean_terminated_length": 445.5176696777344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.1728,
"grad_norm": 0.04064112529158592,
"kl": 0.1116790771484375,
"learning_rate": 1.0555555555555557e-06,
"loss": 0.0137,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.0373004712164402,
"mask/share_reasoning": 0.8302710652351379,
"mask/share_step_conf": 0.12852223217487335,
"num_tokens": 37707046.0,
"reward": 1.1640316247940063,
"reward_std": 0.22133593261241913,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7575312852859497,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8329587578773499,
"step": 162
},
{
"adv/mean_abs_final_conf": 0.6372281312942505,
"adv/mean_abs_reasoning": 0.5325202941894531,
"adv/mean_abs_step_conf": 0.7637654542922974,
"adv/ratio_final_to_reasoning": 1.1966269421978983,
"adv/ratio_step_to_reasoning": 1.43424666181938,
"adv/std_final_conf": 0.8429669737815857,
"adv/std_reasoning": 0.7576155662536621,
"adv/std_step_conf": 0.9351708292961121,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7866588495863462,
"calib/avg_num_step_conf": 5.94140625,
"calib/ece": 0.2068951612903226,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.39919354838709675,
"calib/gap": 0.46308188391635724,
"calib/mean_conf": 0.5001209677419355,
"calib/mu_c": 0.7223255813953489,
"calib/mu_w": 0.2592436974789916,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.09342741935483871,
"calib/std_conf": 0.45073281000163135,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.47199481865284976,
"calib/step_q_c_n": 772.0,
"calib/step_q_gap": 0.123129665114799,
"calib/step_q_w": 0.34886515353805075,
"calib/step_q_w_n": 749.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2149.0,
"completions/max_terminated_length": 2149.0,
"completions/mean_length": 503.72265625,
"completions/mean_terminated_length": 505.69805908203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.17386666666666667,
"grad_norm": 0.03318033367395401,
"kl": 0.101806640625,
"learning_rate": 1.0277777777777777e-06,
"loss": -0.0523,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03390524163842201,
"mask/share_reasoning": 0.8307154178619385,
"mask/share_step_conf": 0.13147307932376862,
"num_tokens": 37940831.0,
"reward": 1.1570065021514893,
"reward_std": 0.1966712772846222,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.7536964416503906,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8433359861373901,
"step": 163
},
{
"adv/mean_abs_final_conf": 0.6957120895385742,
"adv/mean_abs_reasoning": 0.48879551887512207,
"adv/mean_abs_step_conf": 0.7435122132301331,
"adv/ratio_final_to_reasoning": 1.423319287254586,
"adv/ratio_step_to_reasoning": 1.5211109441862258,
"adv/std_final_conf": 0.8903563618659973,
"adv/std_reasoning": 0.7753127217292786,
"adv/std_step_conf": 0.9353837370872498,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7536496350364963,
"calib/avg_num_step_conf": 5.953125,
"calib/ece": 0.258102766798419,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.3438735177865613,
"calib/gap": 0.3506519003272086,
"calib/mean_conf": 0.46229249011857704,
"calib/mu_c": 0.6230656934306569,
"calib/mu_w": 0.27241379310344827,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.08944664031620557,
"calib/std_conf": 0.43647650628239665,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4358097686375321,
"calib/step_q_c_n": 778.0,
"calib/step_q_gap": 0.11571593485737125,
"calib/step_q_w": 0.3200938337801609,
"calib/step_q_w_n": 746.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1908.0,
"completions/max_terminated_length": 1908.0,
"completions/mean_length": 524.1171875,
"completions/mean_terminated_length": 526.172607421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.17493333333333333,
"grad_norm": 0.04268093407154083,
"kl": 0.107574462890625,
"learning_rate": 1.0000000000000002e-06,
"loss": -0.0644,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.031301628798246384,
"mask/share_reasoning": 0.8485506772994995,
"mask/share_step_conf": 0.1162414401769638,
"num_tokens": 38181141.0,
"reward": 1.1473987102508545,
"reward_std": 0.2089129090309143,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.7205109596252441,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8463993072509766,
"step": 164
},
{
"adv/mean_abs_final_conf": 0.7059795260429382,
"adv/mean_abs_reasoning": 0.5269777178764343,
"adv/mean_abs_step_conf": 0.7742398977279663,
"adv/ratio_final_to_reasoning": 1.339676236953298,
"adv/ratio_step_to_reasoning": 1.469208035679243,
"adv/std_final_conf": 0.8767625689506531,
"adv/std_reasoning": 0.775352954864502,
"adv/std_step_conf": 0.9353828430175781,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7407479421867911,
"calib/avg_num_step_conf": 5.82421875,
"calib/ece": 0.23928000000000002,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.452,
"calib/gap": 0.4209799727785339,
"calib/mean_conf": 0.5307999999999999,
"calib/mu_c": 0.7648648648648648,
"calib/mu_w": 0.34388489208633094,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.16304,
"calib/std_conf": 0.45047459417818453,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.44881913303437965,
"calib/step_q_c_n": 669.0,
"calib/step_q_gap": 0.08493835444557185,
"calib/step_q_w": 0.3638807785888078,
"calib/step_q_w_n": 822.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2433.0,
"completions/max_terminated_length": 2433.0,
"completions/mean_length": 513.6484375,
"completions/mean_terminated_length": 515.6627807617188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 109.0,
"epoch": 0.176,
"grad_norm": 0.03866741061210632,
"kl": 0.107147216796875,
"learning_rate": 9.722222222222224e-07,
"loss": -0.0191,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03287225589156151,
"mask/share_reasoning": 0.8358644247055054,
"mask/share_step_conf": 0.12735706567764282,
"num_tokens": 38418211.0,
"reward": 1.1342720985412598,
"reward_std": 0.22282421588897705,
"rewards/accuracy_reward_step": 0.43359375,
"rewards/final_brier_reward_step": 0.7291538715362549,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8387600779533386,
"step": 165
},
{
"adv/mean_abs_final_conf": 0.6282885074615479,
"adv/mean_abs_reasoning": 0.5605138540267944,
"adv/mean_abs_step_conf": 0.7245954275131226,
"adv/ratio_final_to_reasoning": 1.1209152154721114,
"adv/ratio_step_to_reasoning": 1.292734197928468,
"adv/std_final_conf": 0.8683748841285706,
"adv/std_reasoning": 0.8098090291023254,
"adv/std_step_conf": 0.9353573322296143,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.8162691885964912,
"calib/avg_num_step_conf": 6.4765625,
"calib/ece": 0.19221774193548385,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.4959677419354839,
"calib/gap": 0.5316611842105263,
"calib/mean_conf": 0.5836693548387096,
"calib/mu_c": 0.7894736842105263,
"calib/mu_w": 0.2578125,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.08149193548387097,
"calib/std_conf": 0.45787799864321854,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.46612267250821465,
"calib/step_q_c_n": 913.0,
"calib/step_q_gap": 0.14354549130016098,
"calib/step_q_w": 0.3225771812080537,
"calib/step_q_w_n": 745.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2493.0,
"completions/max_terminated_length": 2493.0,
"completions/mean_length": 525.0,
"completions/mean_terminated_length": 527.058837890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.17706666666666668,
"grad_norm": 0.04430662468075752,
"kl": 0.10025787353515625,
"learning_rate": 9.444444444444445e-07,
"loss": 0.039,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.032747916877269745,
"mask/share_reasoning": 0.8334342241287231,
"mask/share_step_conf": 0.1299116164445877,
"num_tokens": 38658795.0,
"reward": 1.1746551990509033,
"reward_std": 0.23381714522838593,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7690227031707764,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.8467544317245483,
"step": 166
},
{
"adv/mean_abs_final_conf": 0.6404507160186768,
"adv/mean_abs_reasoning": 0.5007696151733398,
"adv/mean_abs_step_conf": 0.7638825178146362,
"adv/ratio_final_to_reasoning": 1.2789328597682323,
"adv/ratio_step_to_reasoning": 1.525417067387806,
"adv/std_final_conf": 0.8442684412002563,
"adv/std_reasoning": 0.7394320368766785,
"adv/std_step_conf": 0.9355236887931824,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6737989235165127,
"calib/avg_num_step_conf": 5.484375,
"calib/ece": 0.2622,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.596,
"calib/gap": 0.2984603628148049,
"calib/mean_conf": 0.6941200000000001,
"calib/mu_c": 0.8146979865771812,
"calib/mu_w": 0.5162376237623763,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.18016000000000001,
"calib/std_conf": 0.409020568675953,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.48212837837837835,
"calib/step_q_c_n": 888.0,
"calib/step_q_gap": 0.033213649696207825,
"calib/step_q_w": 0.4489147286821705,
"calib/step_q_w_n": 516.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2470.0,
"completions/max_terminated_length": 2470.0,
"completions/mean_length": 499.2578125,
"completions/mean_terminated_length": 499.2578125,
"completions/min_length": 149.0,
"completions/min_terminated_length": 149.0,
"epoch": 0.17813333333333334,
"grad_norm": 0.032998669892549515,
"kl": 0.0972747802734375,
"learning_rate": 9.166666666666666e-07,
"loss": 0.0018,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.034088607877492905,
"mask/share_reasoning": 0.845373809337616,
"mask/share_step_conf": 0.12053757905960083,
"num_tokens": 38892213.0,
"reward": 1.1221466064453125,
"reward_std": 0.2021704912185669,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7050972580909729,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.818838894367218,
"step": 167
},
{
"adv/mean_abs_final_conf": 0.6557689905166626,
"adv/mean_abs_reasoning": 0.5603863000869751,
"adv/mean_abs_step_conf": 0.7279667854309082,
"adv/ratio_final_to_reasoning": 1.1702088191929096,
"adv/ratio_step_to_reasoning": 1.2990445792802638,
"adv/std_final_conf": 0.8571983575820923,
"adv/std_reasoning": 0.7929483652114868,
"adv/std_step_conf": 0.9356951713562012,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7758103241296519,
"calib/avg_num_step_conf": 5.921875,
"calib/ece": 0.19943775100401606,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.4899598393574297,
"calib/gap": 0.43086434573829513,
"calib/mean_conf": 0.5988755020080322,
"calib/mu_c": 0.7753741496598638,
"calib/mu_w": 0.3445098039215687,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.10397590361445784,
"calib/std_conf": 0.43865294503399493,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4667781908302354,
"calib/step_q_c_n": 807.0,
"calib/step_q_gap": 0.13769497503333838,
"calib/step_q_w": 0.32908321579689703,
"calib/step_q_w_n": 709.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2245.0,
"completions/max_terminated_length": 2245.0,
"completions/mean_length": 522.91796875,
"completions/mean_terminated_length": 527.0354614257812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.1792,
"grad_norm": 0.030330434441566467,
"kl": 0.10137939453125,
"learning_rate": 8.88888888888889e-07,
"loss": -0.0273,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03261226788163185,
"mask/share_reasoning": 0.8453991413116455,
"mask/share_step_conf": 0.11417609453201294,
"num_tokens": 39130752.0,
"reward": 1.1684155464172363,
"reward_std": 0.22862845659255981,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7529062032699585,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8496999740600586,
"step": 168
},
{
"adv/mean_abs_final_conf": 0.6607838869094849,
"adv/mean_abs_reasoning": 0.4686254560947418,
"adv/mean_abs_step_conf": 0.7495909333229065,
"adv/ratio_final_to_reasoning": 1.4100469326102816,
"adv/ratio_step_to_reasoning": 1.5995523153385034,
"adv/std_final_conf": 0.8423540592193604,
"adv/std_reasoning": 0.7207648754119873,
"adv/std_step_conf": 0.9355702996253967,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7115801933320183,
"calib/avg_num_step_conf": 5.4609375,
"calib/ece": 0.25689516129032264,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.532258064516129,
"calib/gap": 0.3188104162556717,
"calib/mean_conf": 0.6358467741935484,
"calib/mu_c": 0.7785401459854014,
"calib/mu_w": 0.45972972972972975,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.17016129032258068,
"calib/std_conf": 0.43216177422638546,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5004310344827586,
"calib/step_q_c_n": 696.0,
"calib/step_q_gap": 0.1279524020041261,
"calib/step_q_w": 0.3724786324786325,
"calib/step_q_w_n": 702.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2804.0,
"completions/max_terminated_length": 2804.0,
"completions/mean_length": 508.15234375,
"completions/mean_terminated_length": 508.15234375,
"completions/min_length": 180.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.18026666666666666,
"grad_norm": 0.034315966069698334,
"kl": 0.099456787109375,
"learning_rate": 8.611111111111112e-07,
"loss": 0.0557,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.034047387540340424,
"mask/share_reasoning": 0.8473041653633118,
"mask/share_step_conf": 0.11864843964576721,
"num_tokens": 39365023.0,
"reward": 1.1078245639801025,
"reward_std": 0.23867198824882507,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6902972459793091,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.817422091960907,
"step": 169
},
{
"adv/mean_abs_final_conf": 0.6348492503166199,
"adv/mean_abs_reasoning": 0.44265952706336975,
"adv/mean_abs_step_conf": 0.7500869631767273,
"adv/ratio_final_to_reasoning": 1.434170534017981,
"adv/ratio_step_to_reasoning": 1.694500891357405,
"adv/std_final_conf": 0.845905065536499,
"adv/std_reasoning": 0.7205403447151184,
"adv/std_step_conf": 0.935236394405365,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7798478089740225,
"calib/avg_num_step_conf": 5.671875,
"calib/ece": 0.22203187250996018,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5896414342629482,
"calib/gap": 0.41587181842036225,
"calib/mean_conf": 0.6696812749003984,
"calib/mu_c": 0.8403378378378379,
"calib/mu_w": 0.42446601941747564,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.15103585657370522,
"calib/std_conf": 0.42753613813375685,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.4838546255506608,
"calib/step_q_c_n": 908.0,
"calib/step_q_gap": 0.12379947849183726,
"calib/step_q_w": 0.3600551470588235,
"calib/step_q_w_n": 544.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2053.0,
"completions/max_terminated_length": 2053.0,
"completions/mean_length": 499.71875,
"completions/mean_terminated_length": 499.71875,
"completions/min_length": 157.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.18133333333333335,
"grad_norm": 0.05624116212129593,
"kl": 0.09752655029296875,
"learning_rate": 8.333333333333333e-07,
"loss": -0.0172,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.032660022377967834,
"mask/share_reasoning": 0.8419471979141235,
"mask/share_step_conf": 0.12539277970790863,
"num_tokens": 39597103.0,
"reward": 1.164198637008667,
"reward_std": 0.22839687764644623,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.7475042939186096,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8471578359603882,
"step": 170
},
{
"adv/mean_abs_final_conf": 0.7149491906166077,
"adv/mean_abs_reasoning": 0.5040189623832703,
"adv/mean_abs_step_conf": 0.7576113343238831,
"adv/ratio_final_to_reasoning": 1.4184966121828966,
"adv/ratio_step_to_reasoning": 1.503140538089069,
"adv/std_final_conf": 0.8838882446289062,
"adv/std_reasoning": 0.7576205730438232,
"adv/std_step_conf": 0.9358006715774536,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6920087064676617,
"calib/avg_num_step_conf": 5.5,
"calib/ece": 0.29799212598425195,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.484251968503937,
"calib/gap": 0.2762300995024875,
"calib/mean_conf": 0.5831889763779528,
"calib/mu_c": 0.7289166666666667,
"calib/mu_w": 0.45268656716417915,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2043700787401575,
"calib/std_conf": 0.4422914425957622,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.48197406340057636,
"calib/step_q_c_n": 694.0,
"calib/step_q_gap": 0.10611972166388167,
"calib/step_q_w": 0.3758543417366947,
"calib/step_q_w_n": 714.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1359.0,
"completions/max_terminated_length": 1359.0,
"completions/mean_length": 465.953125,
"completions/mean_terminated_length": 467.7804260253906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.1824,
"grad_norm": 0.044052887707948685,
"kl": 0.1040191650390625,
"learning_rate": 8.055555555555557e-07,
"loss": -0.0613,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.0346250906586647,
"mask/share_reasoning": 0.8380456566810608,
"mask/share_step_conf": 0.1234230250120163,
"num_tokens": 39823283.0,
"reward": 1.1075711250305176,
"reward_std": 0.23493888974189758,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.6674386858940125,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.837531566619873,
"step": 171
},
{
"adv/mean_abs_final_conf": 0.6492525339126587,
"adv/mean_abs_reasoning": 0.5081915855407715,
"adv/mean_abs_step_conf": 0.7804287672042847,
"adv/ratio_final_to_reasoning": 1.2775743487011555,
"adv/ratio_step_to_reasoning": 1.5356979324516422,
"adv/std_final_conf": 0.8524985313415527,
"adv/std_reasoning": 0.757496178150177,
"adv/std_step_conf": 0.934755802154541,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7525674786043448,
"calib/avg_num_step_conf": 5.546875,
"calib/ece": 0.2151778656126483,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5731225296442688,
"calib/gap": 0.3166366030283081,
"calib/mean_conf": 0.7010276679841898,
"calib/mu_c": 0.8236774193548387,
"calib/mu_w": 0.5070408163265306,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.15177865612648225,
"calib/std_conf": 0.38632622970259445,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4635011709601874,
"calib/step_q_c_n": 854.0,
"calib/step_q_gap": 0.0510630084160178,
"calib/step_q_w": 0.4124381625441696,
"calib/step_q_w_n": 566.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2372.0,
"completions/max_terminated_length": 2372.0,
"completions/mean_length": 474.51953125,
"completions/mean_terminated_length": 474.51953125,
"completions/min_length": 184.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.18346666666666667,
"grad_norm": 0.04483543336391449,
"kl": 0.1094207763671875,
"learning_rate": 7.777777777777779e-07,
"loss": 0.061,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.035601723939180374,
"mask/share_reasoning": 0.8355083465576172,
"mask/share_step_conf": 0.12888994812965393,
"num_tokens": 40048112.0,
"reward": 1.1799538135528564,
"reward_std": 0.2019103467464447,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.7470546960830688,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8627352714538574,
"step": 172
},
{
"adv/mean_abs_final_conf": 0.7111342549324036,
"adv/mean_abs_reasoning": 0.5352945327758789,
"adv/mean_abs_step_conf": 0.7427831292152405,
"adv/ratio_final_to_reasoning": 1.328491533893821,
"adv/ratio_step_to_reasoning": 1.3876157586803421,
"adv/std_final_conf": 0.8797129988670349,
"adv/std_reasoning": 0.792944610118866,
"adv/std_step_conf": 0.9360061883926392,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6726320673427455,
"calib/avg_num_step_conf": 6.3203125,
"calib/ece": 0.29532258064516126,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6612903225806451,
"calib/gap": 0.2637847153178233,
"calib/mean_conf": 0.7501612903225807,
"calib/mu_c": 0.8639716312056738,
"calib/mu_w": 0.6001869158878504,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.23846774193548384,
"calib/std_conf": 0.3867940684715662,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4869066366704162,
"calib/step_q_c_n": 889.0,
"calib/step_q_gap": 0.06949922926300889,
"calib/step_q_w": 0.41740740740740734,
"calib/step_q_w_n": 729.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2397.0,
"completions/max_terminated_length": 2397.0,
"completions/mean_length": 525.81640625,
"completions/mean_terminated_length": 525.81640625,
"completions/min_length": 133.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.18453333333333333,
"grad_norm": 0.03908439353108406,
"kl": 0.095733642578125,
"learning_rate": 7.5e-07,
"loss": 0.0406,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.0347096249461174,
"mask/share_reasoning": 0.8326051235198975,
"mask/share_step_conf": 0.13268522918224335,
"num_tokens": 40285881.0,
"reward": 1.0948352813720703,
"reward_std": 0.27466046810150146,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.6795969009399414,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8035906553268433,
"step": 173
},
{
"adv/mean_abs_final_conf": 0.7004639506340027,
"adv/mean_abs_reasoning": 0.5624732375144958,
"adv/mean_abs_step_conf": 0.7741360664367676,
"adv/ratio_final_to_reasoning": 1.245328495501887,
"adv/ratio_step_to_reasoning": 1.3763073774986794,
"adv/std_final_conf": 0.8960103988647461,
"adv/std_reasoning": 0.7756208777427673,
"adv/std_step_conf": 0.9355176687240601,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.6280317004850722,
"calib/avg_num_step_conf": 5.9140625,
"calib/ece": 0.29665289256198346,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.4834710743801653,
"calib/gap": 0.22471476395436202,
"calib/mean_conf": 0.6067355371900827,
"calib/mu_c": 0.7172357723577234,
"calib/mu_w": 0.4925210084033614,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.19756198347107434,
"calib/std_conf": 0.4254473633542015,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.44660598179453836,
"calib/step_q_c_n": 769.0,
"calib/step_q_gap": 0.04566638447910215,
"calib/step_q_w": 0.4009395973154362,
"calib/step_q_w_n": 745.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2512.0,
"completions/max_terminated_length": 2512.0,
"completions/mean_length": 543.96484375,
"completions/mean_terminated_length": 548.248046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.1856,
"grad_norm": 0.03411554917693138,
"kl": 0.09084320068359375,
"learning_rate": 7.222222222222222e-07,
"loss": -0.0468,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.03144906461238861,
"mask/share_reasoning": 0.8394503593444824,
"mask/share_step_conf": 0.12128806114196777,
"num_tokens": 40529368.0,
"reward": 1.045793056488037,
"reward_std": 0.24938157200813293,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.6349597573280334,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l2_reward": 0.7815009355545044,
"step": 174
},
{
"adv/mean_abs_final_conf": 0.717574417591095,
"adv/mean_abs_reasoning": 0.6078211665153503,
"adv/mean_abs_step_conf": 0.7437684535980225,
"adv/ratio_final_to_reasoning": 1.1805683268731195,
"adv/ratio_step_to_reasoning": 1.2236632986344658,
"adv/std_final_conf": 0.9007578492164612,
"adv/std_reasoning": 0.826562762260437,
"adv/std_step_conf": 0.9361304044723511,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7264182499331013,
"calib/avg_num_step_conf": 6.4296875,
"calib/ece": 0.26903614457831326,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.46586345381526106,
"calib/gap": 0.37825929890286336,
"calib/mean_conf": 0.5473493975903615,
"calib/mu_c": 0.7721782178217823,
"calib/mu_w": 0.39391891891891895,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2053815261044177,
"calib/std_conf": 0.4547935256945654,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.49954861111111115,
"calib/step_q_c_n": 576.0,
"calib/step_q_gap": 0.15431496625129804,
"calib/step_q_w": 0.3452336448598131,
"calib/step_q_w_n": 1070.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2487.0,
"completions/max_terminated_length": 2487.0,
"completions/mean_length": 544.6328125,
"completions/mean_terminated_length": 546.7686767578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.18666666666666668,
"grad_norm": 0.03185137361288071,
"kl": 0.0924072265625,
"learning_rate": 6.944444444444446e-07,
"loss": -0.0444,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03254655748605728,
"mask/share_reasoning": 0.8343169689178467,
"mask/share_step_conf": 0.12923020124435425,
"num_tokens": 40774618.0,
"reward": 1.0986486673355103,
"reward_std": 0.2614133954048157,
"rewards/accuracy_reward_step": 0.3984375,
"rewards/final_brier_reward_step": 0.6948410272598267,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8188250064849854,
"step": 175
},
{
"adv/mean_abs_final_conf": 0.7038910984992981,
"adv/mean_abs_reasoning": 0.5159138441085815,
"adv/mean_abs_step_conf": 0.7356275916099548,
"adv/ratio_final_to_reasoning": 1.3643578410180326,
"adv/ratio_step_to_reasoning": 1.4258729437295956,
"adv/std_final_conf": 0.8717449307441711,
"adv/std_reasoning": 0.757626473903656,
"adv/std_step_conf": 0.935823917388916,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7927789514115613,
"calib/avg_num_step_conf": 5.59375,
"calib/ece": 0.23728000000000007,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.552,
"calib/gap": 0.44018052621471115,
"calib/mean_conf": 0.6571199999999999,
"calib/mu_c": 0.8807317073170733,
"calib/mu_w": 0.44055118110236213,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.20120000000000007,
"calib/std_conf": 0.42556844995840565,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.5200438596491228,
"calib/step_q_c_n": 684.0,
"calib/step_q_gap": 0.1446427901304062,
"calib/step_q_w": 0.3754010695187166,
"calib/step_q_w_n": 748.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2042.0,
"completions/max_terminated_length": 2042.0,
"completions/mean_length": 481.30859375,
"completions/mean_terminated_length": 483.19610595703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.18773333333333334,
"grad_norm": 0.04285878688097,
"kl": 0.10150909423828125,
"learning_rate": 6.666666666666667e-07,
"loss": -0.0869,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03696080669760704,
"mask/share_reasoning": 0.8263058066368103,
"mask/share_step_conf": 0.13282713294029236,
"num_tokens": 41001897.0,
"reward": 1.1082634925842285,
"reward_std": 0.2980533540248871,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.7168636322021484,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.8091504573822021,
"step": 176
},
{
"adv/mean_abs_final_conf": 0.7086734771728516,
"adv/mean_abs_reasoning": 0.44564753770828247,
"adv/mean_abs_step_conf": 0.7541524767875671,
"adv/ratio_final_to_reasoning": 1.590210687165838,
"adv/ratio_step_to_reasoning": 1.6922621869869494,
"adv/std_final_conf": 0.8821295499801636,
"adv/std_reasoning": 0.7014589309692383,
"adv/std_step_conf": 0.9357683658599854,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7694280078895464,
"calib/avg_num_step_conf": 5.73828125,
"calib/ece": 0.22299595141700407,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.48582995951417,
"calib/gap": 0.4125726495726495,
"calib/mean_conf": 0.5868016194331984,
"calib/mu_c": 0.7822307692307692,
"calib/mu_w": 0.3696581196581197,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.14174089068825912,
"calib/std_conf": 0.437853492955034,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5092661870503596,
"calib/step_q_c_n": 695.0,
"calib/step_q_gap": 0.12377523097800819,
"calib/step_q_w": 0.38549095607235145,
"calib/step_q_w_n": 774.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2223.0,
"completions/max_terminated_length": 2223.0,
"completions/mean_length": 506.140625,
"completions/mean_terminated_length": 506.140625,
"completions/min_length": 133.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.1888,
"grad_norm": 0.03434273600578308,
"kl": 0.09429168701171875,
"learning_rate": 6.388888888888889e-07,
"loss": -0.0578,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.033864229917526245,
"mask/share_reasoning": 0.8405582904815674,
"mask/share_step_conf": 0.12557752430438995,
"num_tokens": 41235301.0,
"reward": 1.1220314502716064,
"reward_std": 0.24766887724399567,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.73037189245224,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8127731084823608,
"step": 177
},
{
"adv/mean_abs_final_conf": 0.6909327507019043,
"adv/mean_abs_reasoning": 0.5780574083328247,
"adv/mean_abs_step_conf": 0.7503166794776917,
"adv/ratio_final_to_reasoning": 1.1952666651131127,
"adv/ratio_step_to_reasoning": 1.297996822913627,
"adv/std_final_conf": 0.88179612159729,
"adv/std_reasoning": 0.7929460406303406,
"adv/std_step_conf": 0.9349659085273743,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.796986493374108,
"calib/avg_num_step_conf": 5.609375,
"calib/ece": 0.16919631093544138,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.4782608695652174,
"calib/gap": 0.47001868841318395,
"calib/mean_conf": 0.6095388669301712,
"calib/mu_c": 0.8120370370370371,
"calib/mu_w": 0.34201834862385316,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.10478260869565219,
"calib/std_conf": 0.42505284789110764,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.517627345844504,
"calib/step_q_c_n": 746.0,
"calib/step_q_gap": 0.1893374907720402,
"calib/step_q_w": 0.3282898550724638,
"calib/step_q_w_n": 690.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2598.0,
"completions/max_terminated_length": 2598.0,
"completions/mean_length": 480.34375,
"completions/mean_terminated_length": 480.34375,
"completions/min_length": 127.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.18986666666666666,
"grad_norm": 0.03665749356150627,
"kl": 0.09930419921875,
"learning_rate": 6.111111111111112e-07,
"loss": 0.0778,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03533283621072769,
"mask/share_reasoning": 0.8403965830802917,
"mask/share_step_conf": 0.12427057325839996,
"num_tokens": 41464341.0,
"reward": 1.1939667463302612,
"reward_std": 0.2174018770456314,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.7896803021430969,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8587312698364258,
"step": 178
},
{
"adv/mean_abs_final_conf": 0.7072902917861938,
"adv/mean_abs_reasoning": 0.6448341608047485,
"adv/mean_abs_step_conf": 0.7562251687049866,
"adv/ratio_final_to_reasoning": 1.096856113986735,
"adv/ratio_step_to_reasoning": 1.1727436520441517,
"adv/std_final_conf": 0.8755064010620117,
"adv/std_reasoning": 0.858925461769104,
"adv/std_step_conf": 0.9356964826583862,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.752932788904012,
"calib/avg_num_step_conf": 6.18359375,
"calib/ece": 0.23392000000000007,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.452,
"calib/gap": 0.3749381035711972,
"calib/mean_conf": 0.55288,
"calib/mu_c": 0.7193525179856116,
"calib/mu_w": 0.3444144144144144,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11540000000000006,
"calib/std_conf": 0.4377493639058771,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4893333333333333,
"calib/step_q_c_n": 795.0,
"calib/step_q_gap": 0.09919373942470378,
"calib/step_q_w": 0.3901395939086295,
"calib/step_q_w_n": 788.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1992.0,
"completions/max_terminated_length": 1992.0,
"completions/mean_length": 508.3984375,
"completions/mean_terminated_length": 508.3984375,
"completions/min_length": 180.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.19093333333333334,
"grad_norm": 0.051263369619846344,
"kl": 0.1013641357421875,
"learning_rate": 5.833333333333334e-07,
"loss": 0.0806,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03382003307342529,
"mask/share_reasoning": 0.8366349339485168,
"mask/share_step_conf": 0.12954503297805786,
"num_tokens": 41700755.0,
"reward": 1.1520042419433594,
"reward_std": 0.22504480183124542,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.7291203141212463,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8473212718963623,
"step": 179
},
{
"adv/mean_abs_final_conf": 0.6792027950286865,
"adv/mean_abs_reasoning": 0.5286693572998047,
"adv/mean_abs_step_conf": 0.7685763239860535,
"adv/ratio_final_to_reasoning": 1.2847402363127989,
"adv/ratio_step_to_reasoning": 1.4537939704158023,
"adv/std_final_conf": 0.862787127494812,
"adv/std_reasoning": 0.7755132913589478,
"adv/std_step_conf": 0.9360314607620239,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.7416482655459791,
"calib/avg_num_step_conf": 6.01953125,
"calib/ece": 0.2315416666666667,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.525,
"calib/gap": 0.3361200940237909,
"calib/mean_conf": 0.6391249999999999,
"calib/mu_c": 0.7805755395683454,
"calib/mu_w": 0.44445544554455446,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.14575000000000005,
"calib/std_conf": 0.4145555664905892,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.47737545565006073,
"calib/step_q_c_n": 823.0,
"calib/step_q_gap": 0.11126124952192701,
"calib/step_q_w": 0.3661142061281337,
"calib/step_q_w_n": 718.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2819.0,
"completions/max_terminated_length": 2819.0,
"completions/mean_length": 561.98828125,
"completions/mean_terminated_length": 564.1921997070312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 136.0,
"epoch": 0.192,
"grad_norm": 0.034965962171554565,
"kl": 0.09717559814453125,
"learning_rate": 5.555555555555555e-07,
"loss": 0.0278,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.030879775062203407,
"mask/share_reasoning": 0.8455160856246948,
"mask/share_step_conf": 0.11969786882400513,
"num_tokens": 41948480.0,
"reward": 1.082472801208496,
"reward_std": 0.2739405035972595,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6942156553268433,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l2_reward": 0.7830908298492432,
"step": 180
},
{
"adv/mean_abs_final_conf": 0.6977521181106567,
"adv/mean_abs_reasoning": 0.5350258350372314,
"adv/mean_abs_step_conf": 0.7366287708282471,
"adv/ratio_final_to_reasoning": 1.3041465895980546,
"adv/ratio_step_to_reasoning": 1.3768097213043673,
"adv/std_final_conf": 0.8979544043540955,
"adv/std_reasoning": 0.7927014231681824,
"adv/std_step_conf": 0.9357931613922119,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.8018730158730158,
"calib/avg_num_step_conf": 5.30859375,
"calib/ece": 0.19780876494023902,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.42231075697211157,
"calib/gap": 0.4372215873015873,
"calib/mean_conf": 0.5604382470119521,
"calib/mu_c": 0.77992,
"calib/mu_w": 0.34269841269841267,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.13011952191235057,
"calib/std_conf": 0.4297606662437984,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5105908419497784,
"calib/step_q_c_n": 677.0,
"calib/step_q_gap": 0.13949113520491035,
"calib/step_q_w": 0.3710997067448681,
"calib/step_q_w_n": 682.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2587.0,
"completions/max_terminated_length": 2587.0,
"completions/mean_length": 468.33203125,
"completions/mean_terminated_length": 468.33203125,
"completions/min_length": 168.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.19306666666666666,
"grad_norm": 0.031503282487392426,
"kl": 0.1115264892578125,
"learning_rate": 5.277777777777779e-07,
"loss": 0.0721,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.035831812769174576,
"mask/share_reasoning": 0.8384501934051514,
"mask/share_step_conf": 0.12571796774864197,
"num_tokens": 42174637.0,
"reward": 1.1525561809539795,
"reward_std": 0.23784999549388885,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.7531113624572754,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8393547534942627,
"step": 181
},
{
"adv/mean_abs_final_conf": 0.6776944398880005,
"adv/mean_abs_reasoning": 0.47964245080947876,
"adv/mean_abs_step_conf": 0.7465130686759949,
"adv/ratio_final_to_reasoning": 1.4129158892092957,
"adv/ratio_step_to_reasoning": 1.556394909199814,
"adv/std_final_conf": 0.8647273182868958,
"adv/std_reasoning": 0.7575068473815918,
"adv/std_step_conf": 0.9357609748840332,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.8048875855327469,
"calib/avg_num_step_conf": 5.62890625,
"calib/ece": 0.15850393700787402,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.547244094488189,
"calib/gap": 0.4334688823721082,
"calib/mean_conf": 0.6879527559055119,
"calib/mu_c": 0.8569032258064516,
"calib/mu_w": 0.42343434343434344,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.11811023622047245,
"calib/std_conf": 0.3957447650037054,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5147479484173506,
"calib/step_q_c_n": 853.0,
"calib/step_q_gap": 0.1278431865125887,
"calib/step_q_w": 0.3869047619047619,
"calib/step_q_w_n": 588.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2492.0,
"completions/max_terminated_length": 2492.0,
"completions/mean_length": 479.85546875,
"completions/mean_terminated_length": 481.7372741699219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 197.0,
"epoch": 0.19413333333333332,
"grad_norm": 0.044035859405994415,
"kl": 0.09618377685546875,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0291,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03356048837304115,
"mask/share_reasoning": 0.8339300751686096,
"mask/share_step_conf": 0.12860319018363953,
"num_tokens": 42403640.0,
"reward": 1.1931354999542236,
"reward_std": 0.2289653867483139,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.7957609295845032,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.847839891910553,
"step": 182
},
{
"adv/mean_abs_final_conf": 0.6630797982215881,
"adv/mean_abs_reasoning": 0.5739421844482422,
"adv/mean_abs_step_conf": 0.7572264671325684,
"adv/ratio_final_to_reasoning": 1.1553076532596016,
"adv/ratio_step_to_reasoning": 1.319342762477942,
"adv/std_final_conf": 0.8448971509933472,
"adv/std_reasoning": 0.7929164171218872,
"adv/std_step_conf": 0.9354925155639648,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7529853620955316,
"calib/avg_num_step_conf": 5.40234375,
"calib/ece": 0.25356,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.452,
"calib/gap": 0.3791692347200822,
"calib/mean_conf": 0.5473199999999999,
"calib/mu_c": 0.7262878787878788,
"calib/mu_w": 0.34711864406779663,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.13643999999999998,
"calib/std_conf": 0.44979953045773624,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4964082278481013,
"calib/step_q_c_n": 632.0,
"calib/step_q_gap": 0.12648812132346748,
"calib/step_q_w": 0.3699201065246338,
"calib/step_q_w_n": 751.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2576.0,
"completions/max_terminated_length": 2576.0,
"completions/mean_length": 525.8515625,
"completions/mean_terminated_length": 527.9137573242188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.1952,
"grad_norm": 0.04003230854868889,
"kl": 0.08823394775390625,
"learning_rate": 4.7222222222222226e-07,
"loss": 0.01,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03344167023897171,
"mask/share_reasoning": 0.8502011299133301,
"mask/share_step_conf": 0.11245097219944,
"num_tokens": 42644938.0,
"reward": 1.126890778541565,
"reward_std": 0.2072310894727707,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.7198058366775513,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8236920833587646,
"step": 183
},
{
"adv/mean_abs_final_conf": 0.7117974758148193,
"adv/mean_abs_reasoning": 0.5680974125862122,
"adv/mean_abs_step_conf": 0.7626612186431885,
"adv/ratio_final_to_reasoning": 1.2529496879319089,
"adv/ratio_step_to_reasoning": 1.342483175854018,
"adv/std_final_conf": 0.8946519494056702,
"adv/std_reasoning": 0.7928260564804077,
"adv/std_step_conf": 0.9355891942977905,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7530124514660597,
"calib/avg_num_step_conf": 6.2421875,
"calib/ece": 0.21844621513944223,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6215139442231076,
"calib/gap": 0.378046592582675,
"calib/mean_conf": 0.7079282868525898,
"calib/mu_c": 0.8540259740259739,
"calib/mu_w": 0.47597938144329893,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.15641434262948206,
"calib/std_conf": 0.4043780464107534,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4989277652370203,
"calib/step_q_c_n": 886.0,
"calib/step_q_gap": 0.1237592259111776,
"calib/step_q_w": 0.3751685393258427,
"calib/step_q_w_n": 712.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2881.0,
"completions/max_terminated_length": 2881.0,
"completions/mean_length": 525.46484375,
"completions/mean_terminated_length": 525.46484375,
"completions/min_length": 157.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.19626666666666667,
"grad_norm": 0.03780568763613701,
"kl": 0.09539031982421875,
"learning_rate": 4.444444444444445e-07,
"loss": 0.107,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03317577391862869,
"mask/share_reasoning": 0.8409853577613831,
"mask/share_step_conf": 0.12583887577056885,
"num_tokens": 42884737.0,
"reward": 1.1625230312347412,
"reward_std": 0.257118821144104,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.7507980465888977,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8390820026397705,
"step": 184
},
{
"adv/mean_abs_final_conf": 0.6712584495544434,
"adv/mean_abs_reasoning": 0.48389190435409546,
"adv/mean_abs_step_conf": 0.7558167576789856,
"adv/ratio_final_to_reasoning": 1.3872074393359544,
"adv/ratio_step_to_reasoning": 1.5619537150303404,
"adv/std_final_conf": 0.8619060516357422,
"adv/std_reasoning": 0.7576537132263184,
"adv/std_step_conf": 0.9357088208198547,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.8066108387799564,
"calib/avg_num_step_conf": 6.15625,
"calib/ece": 0.22610655737704916,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.5983606557377049,
"calib/gap": 0.42056372549019605,
"calib/mean_conf": 0.6977459016393442,
"calib/mu_c": 0.8838970588235294,
"calib/mu_w": 0.4633333333333334,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.18323770491803276,
"calib/std_conf": 0.40918681976184174,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5336063408190225,
"calib/step_q_c_n": 757.0,
"calib/step_q_gap": 0.19398485119753284,
"calib/step_q_w": 0.33962148962148964,
"calib/step_q_w_n": 819.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2703.0,
"completions/max_terminated_length": 2703.0,
"completions/mean_length": 541.83984375,
"completions/mean_terminated_length": 541.83984375,
"completions/min_length": 198.0,
"completions/min_terminated_length": 198.0,
"epoch": 0.19733333333333333,
"grad_norm": 0.05819058418273926,
"kl": 0.0870513916015625,
"learning_rate": 4.1666666666666667e-07,
"loss": 0.1115,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.03250384330749512,
"mask/share_reasoning": 0.8443625569343567,
"mask/share_step_conf": 0.1231335997581482,
"num_tokens": 43130368.0,
"reward": 1.122775673866272,
"reward_std": 0.2509518265724182,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.7374019622802734,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8075161576271057,
"step": 185
},
{
"adv/mean_abs_final_conf": 0.6365198493003845,
"adv/mean_abs_reasoning": 0.5316267609596252,
"adv/mean_abs_step_conf": 0.7644907236099243,
"adv/ratio_final_to_reasoning": 1.1973058845860574,
"adv/ratio_step_to_reasoning": 1.4380215213958805,
"adv/std_final_conf": 0.8457080721855164,
"adv/std_reasoning": 0.7753575444221497,
"adv/std_step_conf": 0.9350101947784424,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7555590062111801,
"calib/avg_num_step_conf": 6.046875,
"calib/ece": 0.24968627450980385,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.5411764705882353,
"calib/gap": 0.3616521739130435,
"calib/mean_conf": 0.6489019607843138,
"calib/mu_c": 0.812,
"calib/mu_w": 0.45034782608695656,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1747843137254901,
"calib/std_conf": 0.433074247807235,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.48518159806295397,
"calib/step_q_c_n": 826.0,
"calib/step_q_gap": 0.08296553158095954,
"calib/step_q_w": 0.4022160664819944,
"calib/step_q_w_n": 722.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1372.0,
"completions/max_terminated_length": 1372.0,
"completions/mean_length": 480.0390625,
"completions/mean_terminated_length": 481.9216003417969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.1984,
"grad_norm": 0.027318695560097694,
"kl": 0.101898193359375,
"learning_rate": 3.8888888888888895e-07,
"loss": 0.0403,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03359612450003624,
"mask/share_reasoning": 0.83244788646698,
"mask/share_step_conf": 0.1300496906042099,
"num_tokens": 43358298.0,
"reward": 1.1724010705947876,
"reward_std": 0.20041513442993164,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.7310941219329834,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8700760006904602,
"step": 186
},
{
"adv/mean_abs_final_conf": 0.7441973686218262,
"adv/mean_abs_reasoning": 0.664376974105835,
"adv/mean_abs_step_conf": 0.7474873661994934,
"adv/ratio_final_to_reasoning": 1.1201432283582962,
"adv/ratio_step_to_reasoning": 1.1250952325756236,
"adv/std_final_conf": 0.8918111324310303,
"adv/std_reasoning": 0.8590844869613647,
"adv/std_step_conf": 0.9357367157936096,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6873687664041995,
"calib/avg_num_step_conf": 6.9375,
"calib/ece": 0.2960728744939271,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.4939271255060729,
"calib/gap": 0.280001312335958,
"calib/mean_conf": 0.6168016194331983,
"calib/mu_c": 0.7528346456692913,
"calib/mu_w": 0.47283333333333327,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.19935222672064778,
"calib/std_conf": 0.4320152066568983,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.4732522407170295,
"calib/step_q_c_n": 781.0,
"calib/step_q_gap": 0.1257517382044666,
"calib/step_q_w": 0.34750050251256287,
"calib/step_q_w_n": 995.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2179.0,
"completions/max_terminated_length": 2179.0,
"completions/mean_length": 575.44140625,
"completions/mean_terminated_length": 575.44140625,
"completions/min_length": 173.0,
"completions/min_terminated_length": 173.0,
"epoch": 0.19946666666666665,
"grad_norm": 0.03541375696659088,
"kl": 0.0881195068359375,
"learning_rate": 3.611111111111111e-07,
"loss": -0.0178,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.030242277309298515,
"mask/share_reasoning": 0.8467020988464355,
"mask/share_step_conf": 0.12305556237697601,
"num_tokens": 43607155.0,
"reward": 1.0910013914108276,
"reward_std": 0.30093953013420105,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.6646523475646973,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8172961473464966,
"step": 187
},
{
"adv/mean_abs_final_conf": 0.6986135244369507,
"adv/mean_abs_reasoning": 0.5794321298599243,
"adv/mean_abs_step_conf": 0.7691971063613892,
"adv/ratio_final_to_reasoning": 1.205686547975581,
"adv/ratio_step_to_reasoning": 1.3275016463918559,
"adv/std_final_conf": 0.8814206719398499,
"adv/std_reasoning": 0.7929037809371948,
"adv/std_step_conf": 0.9354791641235352,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7335718932986337,
"calib/avg_num_step_conf": 6.03125,
"calib/ece": 0.24537848605577695,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.601593625498008,
"calib/gap": 0.3321600520494469,
"calib/mean_conf": 0.7028286852589642,
"calib/mu_c": 0.843103448275862,
"calib/mu_w": 0.5109433962264152,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.18525896414342635,
"calib/std_conf": 0.40743435730411504,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.52737922705314,
"calib/step_q_c_n": 828.0,
"calib/step_q_gap": 0.12147140582408972,
"calib/step_q_w": 0.4059078212290503,
"calib/step_q_w_n": 716.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2588.0,
"completions/max_terminated_length": 2588.0,
"completions/mean_length": 523.77734375,
"completions/mean_terminated_length": 525.8314208984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.20053333333333334,
"grad_norm": 0.03188059478998184,
"kl": 0.08913421630859375,
"learning_rate": 3.3333333333333335e-07,
"loss": 0.0705,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.034026190638542175,
"mask/share_reasoning": 0.8330436944961548,
"mask/share_step_conf": 0.12902390956878662,
"num_tokens": 43845314.0,
"reward": 1.1429827213287354,
"reward_std": 0.2559507489204407,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.7220597863197327,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8363538384437561,
"step": 188
},
{
"adv/mean_abs_final_conf": 0.6727781295776367,
"adv/mean_abs_reasoning": 0.44140952825546265,
"adv/mean_abs_step_conf": 0.7335576415061951,
"adv/ratio_final_to_reasoning": 1.5241586021864737,
"adv/ratio_step_to_reasoning": 1.6618527570199022,
"adv/std_final_conf": 0.8597487807273865,
"adv/std_reasoning": 0.7014789581298828,
"adv/std_step_conf": 0.9356318116188049,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7398478835978837,
"calib/avg_num_step_conf": 5.55859375,
"calib/ece": 0.24214574898785426,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.4331983805668016,
"calib/gap": 0.3963564814814815,
"calib/mean_conf": 0.5297570850202429,
"calib/mu_c": 0.7094814814814815,
"calib/mu_w": 0.313125,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.11267206477732798,
"calib/std_conf": 0.4506593703872346,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5149048316251831,
"calib/step_q_c_n": 683.0,
"calib/step_q_gap": 0.2036345613549128,
"calib/step_q_w": 0.31127027027027027,
"calib/step_q_w_n": 740.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2320.0,
"completions/max_terminated_length": 2320.0,
"completions/mean_length": 478.1875,
"completions/mean_terminated_length": 483.85772705078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 89.0,
"epoch": 0.2016,
"grad_norm": 0.03390476480126381,
"kl": 0.1073455810546875,
"learning_rate": 3.055555555555556e-07,
"loss": -0.0692,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03491278365254402,
"mask/share_reasoning": 0.8374266624450684,
"mask/share_step_conf": 0.11594181507825851,
"num_tokens": 44075498.0,
"reward": 1.1270095109939575,
"reward_std": 0.2504253387451172,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.7151448726654053,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8274785280227661,
"step": 189
},
{
"adv/mean_abs_final_conf": 0.6990896463394165,
"adv/mean_abs_reasoning": 0.48381108045578003,
"adv/mean_abs_step_conf": 0.7443450093269348,
"adv/ratio_final_to_reasoning": 1.4449641080581097,
"adv/ratio_step_to_reasoning": 1.5385034353196616,
"adv/std_final_conf": 0.8977615237236023,
"adv/std_reasoning": 0.7576168179512024,
"adv/std_step_conf": 0.9358354210853577,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7738014626218852,
"calib/avg_num_step_conf": 6.3203125,
"calib/ece": 0.19955284552845526,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.540650406504065,
"calib/gap": 0.41882448537378114,
"calib/mean_conf": 0.6448373983739837,
"calib/mu_c": 0.8219014084507043,
"calib/mu_w": 0.4030769230769231,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.1335772357723577,
"calib/std_conf": 0.4239618765263017,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4915936739659368,
"calib/step_q_c_n": 822.0,
"calib/step_q_gap": 0.14889266894081116,
"calib/step_q_w": 0.3427010050251256,
"calib/step_q_w_n": 796.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 3054.0,
"completions/max_terminated_length": 3054.0,
"completions/mean_length": 564.60546875,
"completions/mean_terminated_length": 571.3004150390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 201.0,
"epoch": 0.20266666666666666,
"grad_norm": 0.04439045488834381,
"kl": 0.08817291259765625,
"learning_rate": 2.7777777777777776e-07,
"loss": -0.0007,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.029904117807745934,
"mask/share_reasoning": 0.8434927463531494,
"mask/share_step_conf": 0.11488443613052368,
"num_tokens": 44325645.0,
"reward": 1.1512739658355713,
"reward_std": 0.2505730092525482,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7457519769668579,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8352595567703247,
"step": 190
},
{
"adv/mean_abs_final_conf": 0.7039273977279663,
"adv/mean_abs_reasoning": 0.5166229605674744,
"adv/mean_abs_step_conf": 0.7615114450454712,
"adv/ratio_final_to_reasoning": 1.3625553865332487,
"adv/ratio_step_to_reasoning": 1.4740178102208308,
"adv/std_final_conf": 0.8850259184837341,
"adv/std_reasoning": 0.7576173543930054,
"adv/std_step_conf": 0.9355537295341492,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7208251721741328,
"calib/avg_num_step_conf": 6.55859375,
"calib/ece": 0.2646428571428571,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6190476190476191,
"calib/gap": 0.36280849181777985,
"calib/mean_conf": 0.7055158730158729,
"calib/mu_c": 0.876842105263158,
"calib/mu_w": 0.5140336134453781,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2211904761904762,
"calib/std_conf": 0.41151690420499143,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5186729857819905,
"calib/step_q_c_n": 844.0,
"calib/step_q_gap": 0.13002627919516418,
"calib/step_q_w": 0.3886467065868263,
"calib/step_q_w_n": 835.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1930.0,
"completions/max_terminated_length": 1930.0,
"completions/mean_length": 504.484375,
"completions/mean_terminated_length": 504.484375,
"completions/min_length": 158.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.20373333333333332,
"grad_norm": 0.03353596478700638,
"kl": 0.094268798828125,
"learning_rate": 2.5000000000000004e-07,
"loss": 0.0216,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03465589880943298,
"mask/share_reasoning": 0.8255807161331177,
"mask/share_step_conf": 0.13976339995861053,
"num_tokens": 44558961.0,
"reward": 1.1210756301879883,
"reward_std": 0.23884853720664978,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.714613676071167,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.818879246711731,
"step": 191
},
{
"adv/mean_abs_final_conf": 0.6482102274894714,
"adv/mean_abs_reasoning": 0.49863550066947937,
"adv/mean_abs_step_conf": 0.7356137037277222,
"adv/ratio_final_to_reasoning": 1.2999680660907007,
"adv/ratio_step_to_reasoning": 1.4752533719321437,
"adv/std_final_conf": 0.8706212043762207,
"adv/std_reasoning": 0.75757896900177,
"adv/std_step_conf": 0.9354583024978638,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.832940251572327,
"calib/avg_num_step_conf": 5.59765625,
"calib/ece": 0.16419999999999998,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.552,
"calib/gap": 0.530704926624738,
"calib/mean_conf": 0.65012,
"calib/mu_c": 0.8751388888888889,
"calib/mu_w": 0.3444339622641509,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.11916,
"calib/std_conf": 0.4264837459974295,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.48260172626387177,
"calib/step_q_c_n": 811.0,
"calib/step_q_gap": 0.10440236935068847,
"calib/step_q_w": 0.3781993569131833,
"calib/step_q_w_n": 622.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2721.0,
"completions/max_terminated_length": 2721.0,
"completions/mean_length": 507.06640625,
"completions/mean_terminated_length": 509.054931640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.2048,
"grad_norm": 0.061243936419487,
"kl": 0.10663604736328125,
"learning_rate": 2.2222222222222224e-07,
"loss": 0.0477,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.035767100751399994,
"mask/share_reasoning": 0.837902307510376,
"mask/share_step_conf": 0.12242428958415985,
"num_tokens": 44793746.0,
"reward": 1.1880981922149658,
"reward_std": 0.2168246954679489,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.8043121099472046,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8432351350784302,
"step": 192
},
{
"adv/mean_abs_final_conf": 0.7263143062591553,
"adv/mean_abs_reasoning": 0.6314749717712402,
"adv/mean_abs_step_conf": 0.7740864157676697,
"adv/ratio_final_to_reasoning": 1.150187004596394,
"adv/ratio_step_to_reasoning": 1.225838632363235,
"adv/std_final_conf": 0.8906956315040588,
"adv/std_reasoning": 0.8266628384590149,
"adv/std_step_conf": 0.9355929493904114,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7353971232020013,
"calib/avg_num_step_conf": 5.69140625,
"calib/ece": 0.2540316205533597,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.45849802371541504,
"calib/gap": 0.31774108818011254,
"calib/mean_conf": 0.6152173913043478,
"calib/mu_c": 0.7696923076923077,
"calib/mu_w": 0.45195121951219513,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.17770750988142292,
"calib/std_conf": 0.4199242969769833,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.508815592203898,
"calib/step_q_c_n": 667.0,
"calib/step_q_gap": 0.12327128840642965,
"calib/step_q_w": 0.3855443037974684,
"calib/step_q_w_n": 790.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2303.0,
"completions/max_terminated_length": 2303.0,
"completions/mean_length": 504.51171875,
"completions/mean_terminated_length": 504.51171875,
"completions/min_length": 167.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.20586666666666667,
"grad_norm": 0.042873747646808624,
"kl": 0.0930938720703125,
"learning_rate": 1.9444444444444447e-07,
"loss": 0.0569,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.033232901245355606,
"mask/share_reasoning": 0.8482795357704163,
"mask/share_step_conf": 0.11848757416009903,
"num_tokens": 45028613.0,
"reward": 1.1263048648834229,
"reward_std": 0.26204511523246765,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.7099542617797852,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8289578557014465,
"step": 193
},
{
"adv/mean_abs_final_conf": 0.6581943035125732,
"adv/mean_abs_reasoning": 0.5350709557533264,
"adv/mean_abs_step_conf": 0.7595569491386414,
"adv/ratio_final_to_reasoning": 1.2301065801373978,
"adv/ratio_step_to_reasoning": 1.4195443444865383,
"adv/std_final_conf": 0.8700836300849915,
"adv/std_reasoning": 0.7928699254989624,
"adv/std_step_conf": 0.9356017708778381,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.801510989010989,
"calib/avg_num_step_conf": 5.4140625,
"calib/ece": 0.19689243027888442,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.49800796812749004,
"calib/gap": 0.42150641025641017,
"calib/mean_conf": 0.6286852589641435,
"calib/mu_c": 0.8033333333333332,
"calib/mu_w": 0.38182692307692306,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.11996015936254979,
"calib/std_conf": 0.42559387209635524,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5106290956749673,
"calib/step_q_c_n": 763.0,
"calib/step_q_gap": 0.13867082922231883,
"calib/step_q_w": 0.37195826645264846,
"calib/step_q_w_n": 623.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2580.0,
"completions/max_terminated_length": 2580.0,
"completions/mean_length": 489.75,
"completions/mean_terminated_length": 491.6706237792969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.20693333333333333,
"grad_norm": 0.031372662633657455,
"kl": 0.09343719482421875,
"learning_rate": 1.6666666666666668e-07,
"loss": 0.0224,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03451238200068474,
"mask/share_reasoning": 0.845749020576477,
"mask/share_step_conf": 0.11583234369754791,
"num_tokens": 45259933.0,
"reward": 1.1746629476547241,
"reward_std": 0.23227962851524353,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7637102007865906,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8497854471206665,
"step": 194
},
{
"adv/mean_abs_final_conf": 0.6965253353118896,
"adv/mean_abs_reasoning": 0.5378446578979492,
"adv/mean_abs_step_conf": 0.7651320695877075,
"adv/ratio_final_to_reasoning": 1.2950306842018473,
"adv/ratio_step_to_reasoning": 1.4225893264015348,
"adv/std_final_conf": 0.8900267481803894,
"adv/std_reasoning": 0.792778491973877,
"adv/std_step_conf": 0.9356444478034973,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7591954022988506,
"calib/avg_num_step_conf": 5.828125,
"calib/ece": 0.2272509960159363,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5099601593625498,
"calib/gap": 0.4037758620689656,
"calib/mean_conf": 0.6113944223107569,
"calib/mu_c": 0.798,
"calib/mu_w": 0.39422413793103445,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.15039840637450205,
"calib/std_conf": 0.43467445322633635,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5138539042821159,
"calib/step_q_c_n": 794.0,
"calib/step_q_gap": 0.11080232835088377,
"calib/step_q_w": 0.4030515759312321,
"calib/step_q_w_n": 698.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2257.0,
"completions/max_terminated_length": 2257.0,
"completions/mean_length": 490.859375,
"completions/mean_terminated_length": 494.7243957519531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.208,
"grad_norm": 0.041009191423654556,
"kl": 0.10141754150390625,
"learning_rate": 1.3888888888888888e-07,
"loss": -0.0664,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.034289367496967316,
"mask/share_reasoning": 0.8296006917953491,
"mask/share_step_conf": 0.12829747796058655,
"num_tokens": 45491577.0,
"reward": 1.1288546323776245,
"reward_std": 0.25321608781814575,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.735246479511261,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8149751424789429,
"step": 195
},
{
"adv/mean_abs_final_conf": 0.639642596244812,
"adv/mean_abs_reasoning": 0.43357157707214355,
"adv/mean_abs_step_conf": 0.7559478282928467,
"adv/ratio_final_to_reasoning": 1.4752871960939902,
"adv/ratio_step_to_reasoning": 1.7435364038336438,
"adv/std_final_conf": 0.8292726278305054,
"adv/std_reasoning": 0.7013278007507324,
"adv/std_step_conf": 0.9355693459510803,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7500616294835449,
"calib/avg_num_step_conf": 5.3203125,
"calib/ece": 0.22945098039215683,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.611764705882353,
"calib/gap": 0.3932916307161346,
"calib/mean_conf": 0.7307843137254901,
"calib/mu_c": 0.9189473684210526,
"calib/mu_w": 0.525655737704918,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2193333333333333,
"calib/std_conf": 0.3851388501197819,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5326361031518624,
"calib/step_q_c_n": 698.0,
"calib/step_q_gap": 0.08710899471812744,
"calib/step_q_w": 0.44552710843373494,
"calib/step_q_w_n": 664.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1151.0,
"completions/max_terminated_length": 1151.0,
"completions/mean_length": 421.9609375,
"completions/mean_terminated_length": 423.61572265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.20906666666666668,
"grad_norm": 0.03640659153461456,
"kl": 0.1021728515625,
"learning_rate": 1.1111111111111112e-07,
"loss": -0.0044,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03759670630097389,
"mask/share_reasoning": 0.827721118927002,
"mask/share_step_conf": 0.13077595829963684,
"num_tokens": 45702143.0,
"reward": 1.1514766216278076,
"reward_std": 0.18845239281654358,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.7516941428184509,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.832089364528656,
"step": 196
},
{
"adv/mean_abs_final_conf": 0.7016844749450684,
"adv/mean_abs_reasoning": 0.6250609755516052,
"adv/mean_abs_step_conf": 0.7250959277153015,
"adv/ratio_final_to_reasoning": 1.1225856394663645,
"adv/ratio_step_to_reasoning": 1.1600403097880447,
"adv/std_final_conf": 0.8904477953910828,
"adv/std_reasoning": 0.8589926958084106,
"adv/std_step_conf": 0.9358147382736206,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7472128597355459,
"calib/avg_num_step_conf": 6.12109375,
"calib/ece": 0.24686746987951802,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5261044176706827,
"calib/gap": 0.35282473424941674,
"calib/mean_conf": 0.6451807228915661,
"calib/mu_c": 0.8095488721804512,
"calib/mu_w": 0.4567241379310345,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.17895582329317267,
"calib/std_conf": 0.4208769619953476,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4814709677419355,
"calib/step_q_c_n": 775.0,
"calib/step_q_gap": 0.08943813945910717,
"calib/step_q_w": 0.3920328282828283,
"calib/step_q_w_n": 792.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2619.0,
"completions/max_terminated_length": 2619.0,
"completions/mean_length": 518.7578125,
"completions/mean_terminated_length": 518.7578125,
"completions/min_length": 166.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.21013333333333334,
"grad_norm": 0.03738636150956154,
"kl": 0.0984039306640625,
"learning_rate": 8.333333333333334e-08,
"loss": 0.0697,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.032930146902799606,
"mask/share_reasoning": 0.8381119966506958,
"mask/share_step_conf": 0.1289578378200531,
"num_tokens": 45940001.0,
"reward": 1.1338474750518799,
"reward_std": 0.2597096562385559,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.7132207155227661,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8373578190803528,
"step": 197
},
{
"adv/mean_abs_final_conf": 0.6680877208709717,
"adv/mean_abs_reasoning": 0.5190901756286621,
"adv/mean_abs_step_conf": 0.7632308006286621,
"adv/ratio_final_to_reasoning": 1.2870359568293908,
"adv/ratio_step_to_reasoning": 1.470324110265283,
"adv/std_final_conf": 0.8786221146583557,
"adv/std_reasoning": 0.7753717303276062,
"adv/std_step_conf": 0.9356816411018372,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.8258064516129031,
"calib/avg_num_step_conf": 6.07421875,
"calib/ece": 0.15480314960629915,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.48031496062992124,
"calib/gap": 0.5219745845552297,
"calib/mean_conf": 0.5991338582677166,
"calib/mu_c": 0.8025806451612902,
"calib/mu_w": 0.2806060606060606,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07185039370078736,
"calib/std_conf": 0.4342103405922523,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.49769696335078534,
"calib/step_q_c_n": 955.0,
"calib/step_q_gap": 0.10981363001745204,
"calib/step_q_w": 0.3878833333333333,
"calib/step_q_w_n": 600.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2560.0,
"completions/max_terminated_length": 2560.0,
"completions/mean_length": 457.97265625,
"completions/mean_terminated_length": 457.97265625,
"completions/min_length": 156.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.2112,
"grad_norm": 0.05018523335456848,
"kl": 0.1059112548828125,
"learning_rate": 5.555555555555556e-08,
"loss": 0.0534,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.037562139332294464,
"mask/share_reasoning": 0.8166549205780029,
"mask/share_step_conf": 0.14578291773796082,
"num_tokens": 46162626.0,
"reward": 1.2054414749145508,
"reward_std": 0.20616331696510315,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.8151402473449707,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.851328432559967,
"step": 198
},
{
"adv/mean_abs_final_conf": 0.7400945425033569,
"adv/mean_abs_reasoning": 0.6025816202163696,
"adv/mean_abs_step_conf": 0.77257239818573,
"adv/ratio_final_to_reasoning": 1.2282063004802741,
"adv/ratio_step_to_reasoning": 1.2821041536386748,
"adv/std_final_conf": 0.906673014163971,
"adv/std_reasoning": 0.8266450762748718,
"adv/std_step_conf": 0.9356889724731445,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6141747828375889,
"calib/avg_num_step_conf": 5.8671875,
"calib/ece": 0.32704453441295545,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.582995951417004,
"calib/gap": 0.2014648591734668,
"calib/mean_conf": 0.6980566801619432,
"calib/mu_c": 0.7926717557251909,
"calib/mu_w": 0.5912068965517241,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.24736842105263157,
"calib/std_conf": 0.412814312622985,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5140081521739132,
"calib/step_q_c_n": 736.0,
"calib/step_q_gap": 0.11232407906686354,
"calib/step_q_w": 0.4016840731070496,
"calib/step_q_w_n": 766.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2784.0,
"completions/max_terminated_length": 2784.0,
"completions/mean_length": 544.16796875,
"completions/mean_terminated_length": 546.302001953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 111.0,
"epoch": 0.21226666666666666,
"grad_norm": 0.03123634122312069,
"kl": 0.0904998779296875,
"learning_rate": 2.777777777777778e-08,
"loss": 0.0078,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03431296348571777,
"mask/share_reasoning": 0.833673894405365,
"mask/share_step_conf": 0.12810686230659485,
"num_tokens": 46406133.0,
"reward": 1.0824717283248901,
"reward_std": 0.27605992555618286,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.6297984719276428,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8265548944473267,
"step": 199
},
{
"adv/mean_abs_final_conf": 0.6329021453857422,
"adv/mean_abs_reasoning": 0.4508885145187378,
"adv/mean_abs_step_conf": 0.7580750584602356,
"adv/ratio_final_to_reasoning": 1.403677683077111,
"adv/ratio_step_to_reasoning": 1.6812915699779527,
"adv/std_final_conf": 0.8475625514984131,
"adv/std_reasoning": 0.701388418674469,
"adv/std_step_conf": 0.9355641007423401,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.8224703149199794,
"calib/avg_num_step_conf": 5.75,
"calib/ece": 0.191106719367589,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5652173913043478,
"calib/gap": 0.49758711925658233,
"calib/mean_conf": 0.6452569169960475,
"calib/mu_c": 0.8497986577181208,
"calib/mu_w": 0.35221153846153846,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.12371541501976291,
"calib/std_conf": 0.4400858176299625,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5602846534653466,
"calib/step_q_c_n": 808.0,
"calib/step_q_gap": 0.21865814744125028,
"calib/step_q_w": 0.34162650602409633,
"calib/step_q_w_n": 664.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2535.0,
"completions/max_terminated_length": 2535.0,
"completions/mean_length": 520.86328125,
"completions/mean_terminated_length": 520.86328125,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.21333333333333335,
"grad_norm": 0.033193521201610565,
"kl": 0.0963134765625,
"learning_rate": 0.0,
"loss": 0.0717,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03284844756126404,
"mask/share_reasoning": 0.8499813079833984,
"mask/share_step_conf": 0.11717026680707932,
"num_tokens": 46647522.0,
"reward": 1.2013983726501465,
"reward_std": 0.2132171094417572,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7925854921340942,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8640990853309631,
"step": 200
},
{
"epoch": 0.21333333333333335,
"step": 200,
"total_flos": 0.0,
"train_loss": -0.0034122529719024896,
"train_runtime": 12611.9052,
"train_samples_per_second": 4.06,
"train_steps_per_second": 0.016
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 46647522,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}