Files
PureRL-1.5B-v7-s2-l2-kl-w2-b1/trainer_state.json
ModelHub XC e29d610954 初始化项目,由ModelHub XC社区提供模型
Model: zhaohq/PureRL-1.5B-v7-s2-l2-kl-w2-b1
Source: Original Platform
2026-06-03 16:18:53 +08:00

12243 lines
503 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21333333333333335,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"adv/mean_abs_final_conf": 0.773959219455719,
"adv/mean_abs_reasoning": 0.47714588046073914,
"adv/mean_abs_step_conf": 0.7498364448547363,
"adv/ratio_final_to_reasoning": 1.622059942565935,
"adv/ratio_step_to_reasoning": 1.5715035496705603,
"adv/std_final_conf": 0.9294352531433105,
"adv/std_reasoning": 0.7393431663513184,
"adv/std_step_conf": 0.9352971315383911,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.38076182006817844,
"calib/avg_num_step_conf": 5.23046875,
"calib/ece": 0.2003187250996017,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.2948207171314741,
"calib/gap": -0.026059730250481805,
"calib/mean_conf": 0.8737051792828686,
"calib/mu_c": 0.865606936416185,
"calib/mu_w": 0.8916666666666668,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19239043824701207,
"calib/std_conf": 0.09027744273295583,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7959393232205367,
"calib/step_q_c_n": 857.0,
"calib/step_q_gap": -0.006446568895645877,
"calib/step_q_w": 0.8023858921161826,
"calib/step_q_w_n": 482.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2492.0,
"completions/max_terminated_length": 2492.0,
"completions/mean_length": 474.94921875,
"completions/mean_terminated_length": 478.68896484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.0010666666666666667,
"grad_norm": 0.04300324618816376,
"kl": 0.000291675329208374,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.0136,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03466901555657387,
"mask/share_reasoning": 0.8340686559677124,
"mask/share_step_conf": 0.12344987690448761,
"num_tokens": 229171.0,
"reward": 1.0788748264312744,
"reward_std": 0.22853493690490723,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7142800688743591,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7420004606246948,
"step": 1
},
{
"adv/mean_abs_final_conf": 0.7672724723815918,
"adv/mean_abs_reasoning": 0.5104547739028931,
"adv/mean_abs_step_conf": 0.770571768283844,
"adv/ratio_final_to_reasoning": 1.503115479781084,
"adv/ratio_step_to_reasoning": 1.509578923891962,
"adv/std_final_conf": 0.9330522418022156,
"adv/std_reasoning": 0.7575037479400635,
"adv/std_step_conf": 0.9354329705238342,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.44343065693430656,
"calib/avg_num_step_conf": 5.05859375,
"calib/ece": 0.3349411764705883,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.2823529411764706,
"calib/gap": 0.002352468143016151,
"calib/mean_conf": 0.8721960784313726,
"calib/mu_c": 0.8732846715328467,
"calib/mu_w": 0.8709322033898306,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3349411764705883,
"calib/std_conf": 0.07627016470309335,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7954391371340525,
"calib/step_q_c_n": 649.0,
"calib/step_q_gap": 0.011011892552009073,
"calib/step_q_w": 0.7844272445820434,
"calib/step_q_w_n": 646.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1966.0,
"completions/max_terminated_length": 1966.0,
"completions/mean_length": 492.9765625,
"completions/mean_terminated_length": 494.9098205566406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.0021333333333333334,
"grad_norm": 0.040453653782606125,
"kl": 0.00037539005279541016,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0158,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03364308178424835,
"mask/share_reasoning": 0.8523939251899719,
"mask/share_step_conf": 0.11005672812461853,
"num_tokens": 458661.0,
"reward": 1.016056776046753,
"reward_std": 0.2184845209121704,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6320762038230896,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.7291916012763977,
"step": 2
},
{
"adv/mean_abs_final_conf": 0.7699410915374756,
"adv/mean_abs_reasoning": 0.4286423921585083,
"adv/mean_abs_step_conf": 0.7708143591880798,
"adv/ratio_final_to_reasoning": 1.7962317904682603,
"adv/ratio_step_to_reasoning": 1.7982690776488557,
"adv/std_final_conf": 0.9275014996528625,
"adv/std_reasoning": 0.7013915777206421,
"adv/std_step_conf": 0.9344233870506287,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.430653197733943,
"calib/avg_num_step_conf": 4.953125,
"calib/ece": 0.24111111111111116,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.28174603174603174,
"calib/gap": -0.010320114667940916,
"calib/mean_conf": 0.88,
"calib/mu_c": 0.8762732919254658,
"calib/mu_w": 0.8865934065934067,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.24111111111111116,
"calib/std_conf": 0.042323395908998626,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7988338192419825,
"calib/step_q_c_n": 686.0,
"calib/step_q_gap": 0.053954094156071886,
"calib/step_q_w": 0.7448797250859106,
"calib/step_q_w_n": 582.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2755.0,
"completions/max_terminated_length": 2755.0,
"completions/mean_length": 503.89453125,
"completions/mean_terminated_length": 505.87060546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.0032,
"grad_norm": 0.07180804014205933,
"kl": 0.0010092556476593018,
"learning_rate": 7.5e-07,
"loss": 0.0958,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.033395763486623764,
"mask/share_reasoning": 0.8536885976791382,
"mask/share_step_conf": 0.10900937020778656,
"num_tokens": 692914.0,
"reward": 1.0585122108459473,
"reward_std": 0.20415569841861725,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.6930652260780334,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7347228527069092,
"step": 3
},
{
"adv/mean_abs_final_conf": 0.7719908952713013,
"adv/mean_abs_reasoning": 0.37903717160224915,
"adv/mean_abs_step_conf": 0.7598022818565369,
"adv/ratio_final_to_reasoning": 2.0367155337508867,
"adv/ratio_step_to_reasoning": 2.0045587577723163,
"adv/std_final_conf": 0.9269395470619202,
"adv/std_reasoning": 0.6612975001335144,
"adv/std_step_conf": 0.9352869391441345,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5061005032053492,
"calib/avg_num_step_conf": 5.109375,
"calib/ece": 0.23214285714285715,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.28174603174603174,
"calib/gap": 0.00448473150892692,
"calib/mean_conf": 0.878968253968254,
"calib/mu_c": 0.880552147239264,
"calib/mu_w": 0.8760674157303371,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.23214285714285715,
"calib/std_conf": 0.0459266040595583,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7920047449584816,
"calib/step_q_c_n": 843.0,
"calib/step_q_gap": 0.003940228829449333,
"calib/step_q_w": 0.7880645161290323,
"calib/step_q_w_n": 465.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2943.0,
"completions/max_terminated_length": 2943.0,
"completions/mean_length": 515.4375,
"completions/mean_terminated_length": 515.4375,
"completions/min_length": 178.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.004266666666666667,
"grad_norm": 0.041159238666296005,
"kl": 0.0002847015857696533,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0688,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03243076801300049,
"mask/share_reasoning": 0.8518642783164978,
"mask/share_step_conf": 0.11570495367050171,
"num_tokens": 931034.0,
"reward": 1.0581204891204834,
"reward_std": 0.20035883784294128,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.7063945531845093,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7237518429756165,
"step": 4
},
{
"adv/mean_abs_final_conf": 0.7795875072479248,
"adv/mean_abs_reasoning": 0.41760870814323425,
"adv/mean_abs_step_conf": 0.7794197201728821,
"adv/ratio_final_to_reasoning": 1.866789394105586,
"adv/ratio_step_to_reasoning": 1.8663876135110464,
"adv/std_final_conf": 0.9297929406166077,
"adv/std_reasoning": 0.6816225647926331,
"adv/std_step_conf": 0.9355724453926086,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.43911059700533384,
"calib/avg_num_step_conf": 4.703125,
"calib/ece": 0.3465999999999999,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.308,
"calib/gap": -0.00953987532934919,
"calib/mean_conf": 0.8786,
"calib/mu_c": 0.8741353383458645,
"calib/mu_w": 0.8836752136752137,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.3465999999999999,
"calib/std_conf": 0.04492705198430006,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.7988571428571428,
"calib/step_q_c_n": 630.0,
"calib/step_q_gap": 0.010703832752613107,
"calib/step_q_w": 0.7881533101045297,
"calib/step_q_w_n": 574.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2643.0,
"completions/max_terminated_length": 2643.0,
"completions/mean_length": 528.9921875,
"completions/mean_terminated_length": 528.9921875,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.005333333333333333,
"grad_norm": 0.043477851897478104,
"kl": 0.00028324127197265625,
"learning_rate": 1.25e-06,
"loss": 0.0765,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03482171893119812,
"mask/share_reasoning": 0.8547407388687134,
"mask/share_step_conf": 0.1104375571012497,
"num_tokens": 1173144.0,
"reward": 0.9544405341148376,
"reward_std": 0.1946556568145752,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.6074038743972778,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.6697347164154053,
"step": 5
},
{
"adv/mean_abs_final_conf": 0.7901644706726074,
"adv/mean_abs_reasoning": 0.4342483878135681,
"adv/mean_abs_step_conf": 0.7639518976211548,
"adv/ratio_final_to_reasoning": 1.8196140569480743,
"adv/ratio_step_to_reasoning": 1.7592509703205512,
"adv/std_final_conf": 0.9303911328315735,
"adv/std_reasoning": 0.6817383766174316,
"adv/std_step_conf": 0.9353141784667969,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.525006314725941,
"calib/avg_num_step_conf": 5.03125,
"calib/ece": 0.30039215686274506,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.3176470588235294,
"calib/gap": 0.004571861581207259,
"calib/mean_conf": 0.8807843137254903,
"calib/mu_c": 0.8827027027027026,
"calib/mu_w": 0.8781308411214953,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.30039215686274506,
"calib/std_conf": 0.04316149051810785,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.793986577181208,
"calib/step_q_c_n": 745.0,
"calib/step_q_gap": -0.0015567009035065693,
"calib/step_q_w": 0.7955432780847146,
"calib/step_q_w_n": 543.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1498.0,
"completions/max_terminated_length": 1498.0,
"completions/mean_length": 445.44921875,
"completions/mean_terminated_length": 445.44921875,
"completions/min_length": 113.0,
"completions/min_terminated_length": 113.0,
"epoch": 0.0064,
"grad_norm": 0.045135434716939926,
"kl": 0.0006567239761352539,
"learning_rate": 1.5e-06,
"loss": -0.004,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.036960337311029434,
"mask/share_reasoning": 0.8375744819641113,
"mask/share_step_conf": 0.12546522915363312,
"num_tokens": 1393131.0,
"reward": 1.0118639469146729,
"reward_std": 0.21928110718727112,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.6584070324897766,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7018805742263794,
"step": 6
},
{
"adv/mean_abs_final_conf": 0.7620992660522461,
"adv/mean_abs_reasoning": 0.5001809597015381,
"adv/mean_abs_step_conf": 0.7641808390617371,
"adv/ratio_final_to_reasoning": 1.523647094657495,
"adv/ratio_step_to_reasoning": 1.52780873449827,
"adv/std_final_conf": 0.9296735525131226,
"adv/std_reasoning": 0.7575913071632385,
"adv/std_step_conf": 0.9349011778831482,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.4986007781038837,
"calib/avg_num_step_conf": 5.23046875,
"calib/ece": 0.2416666666666666,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.32142857142857145,
"calib/gap": 0.004653607262303239,
"calib/mean_conf": 0.8805555555555555,
"calib/mu_c": 0.8822360248447206,
"calib/mu_w": 0.8775824175824174,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.2416666666666666,
"calib/std_conf": 0.05412808964099344,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7983625730994154,
"calib/step_q_c_n": 855.0,
"calib/step_q_gap": 0.01571794500024193,
"calib/step_q_w": 0.7826446280991735,
"calib/step_q_w_n": 484.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2218.0,
"completions/max_terminated_length": 2218.0,
"completions/mean_length": 537.640625,
"completions/mean_terminated_length": 539.7490234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.007466666666666667,
"grad_norm": 0.07143445312976837,
"kl": 0.014304488897323608,
"learning_rate": 1.75e-06,
"loss": 0.0537,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.030748117715120316,
"mask/share_reasoning": 0.858625054359436,
"mask/share_step_conf": 0.10672055184841156,
"num_tokens": 1638191.0,
"reward": 1.0623219013214111,
"reward_std": 0.2287452518939972,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.695104718208313,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.738442599773407,
"step": 7
},
{
"adv/mean_abs_final_conf": 0.7889758944511414,
"adv/mean_abs_reasoning": 0.3992607593536377,
"adv/mean_abs_step_conf": 0.7824378609657288,
"adv/ratio_final_to_reasoning": 1.976091754492509,
"adv/ratio_step_to_reasoning": 1.959716407473691,
"adv/std_final_conf": 0.9319833517074585,
"adv/std_reasoning": 0.661307692527771,
"adv/std_step_conf": 0.9356229305267334,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.46211683053788316,
"calib/avg_num_step_conf": 4.51171875,
"calib/ece": 0.34732000000000013,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.32,
"calib/gap": -0.002402801876486005,
"calib/mean_conf": 0.87932,
"calib/mu_c": 0.8781954887218046,
"calib/mu_w": 0.8805982905982906,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.34732000000000013,
"calib/std_conf": 0.05104446688917419,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.7933065810593901,
"calib/step_q_c_n": 623.0,
"calib/step_q_gap": 0.01719755850299909,
"calib/step_q_w": 0.776109022556391,
"calib/step_q_w_n": 532.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2145.0,
"completions/max_terminated_length": 2145.0,
"completions/mean_length": 516.56640625,
"completions/mean_terminated_length": 516.56640625,
"completions/min_length": 190.0,
"completions/min_terminated_length": 190.0,
"epoch": 0.008533333333333334,
"grad_norm": 0.03993474319577217,
"kl": 0.0004799962043762207,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.0164,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03236281871795654,
"mask/share_reasoning": 0.8639826774597168,
"mask/share_step_conf": 0.10365445911884308,
"num_tokens": 1876944.0,
"reward": 0.9967085123062134,
"reward_std": 0.19894912838935852,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.611905038356781,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7215287685394287,
"step": 8
},
{
"adv/mean_abs_final_conf": 0.7554298639297485,
"adv/mean_abs_reasoning": 0.4521360397338867,
"adv/mean_abs_step_conf": 0.7652294635772705,
"adv/ratio_final_to_reasoning": 1.6708021425904716,
"adv/ratio_step_to_reasoning": 1.6924761494962022,
"adv/std_final_conf": 0.9297342896461487,
"adv/std_reasoning": 0.7205620408058167,
"adv/std_step_conf": 0.9354267716407776,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.41982228298017776,
"calib/avg_num_step_conf": 5.03515625,
"calib/ece": 0.25763052208835335,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.27309236947791166,
"calib/gap": -0.00714969241285035,
"calib/mean_conf": 0.8761044176706826,
"calib/mu_c": 0.8733766233766234,
"calib/mu_w": 0.8805263157894737,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.25763052208835335,
"calib/std_conf": 0.05478403327402361,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.7702870090634442,
"calib/step_q_c_n": 662.0,
"calib/step_q_gap": 0.07661874750044573,
"calib/step_q_w": 0.6936682615629984,
"calib/step_q_w_n": 627.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2913.0,
"completions/max_terminated_length": 2913.0,
"completions/mean_length": 510.15234375,
"completions/mean_terminated_length": 514.1693115234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.0096,
"grad_norm": 0.04263272136449814,
"kl": 0.0003707706928253174,
"learning_rate": 2.25e-06,
"loss": -0.034,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.033698923885822296,
"mask/share_reasoning": 0.8547195196151733,
"mask/share_step_conf": 0.10376904904842377,
"num_tokens": 2115079.0,
"reward": 1.0097324848175049,
"reward_std": 0.24089282751083374,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6642941236495972,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.6951137781143188,
"step": 9
},
{
"adv/mean_abs_final_conf": 0.7777752876281738,
"adv/mean_abs_reasoning": 0.4451920986175537,
"adv/mean_abs_step_conf": 0.7588284015655518,
"adv/ratio_final_to_reasoning": 1.7470554622226768,
"adv/ratio_step_to_reasoning": 1.7044965620951646,
"adv/std_final_conf": 0.9298601150512695,
"adv/std_reasoning": 0.7014667391777039,
"adv/std_step_conf": 0.9350449442863464,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5539309378185525,
"calib/avg_num_step_conf": 5.12890625,
"calib/ece": 0.3145849802371541,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.391304347826087,
"calib/gap": 0.01739678899082564,
"calib/mean_conf": 0.8837549407114624,
"calib/mu_c": 0.89125,
"calib/mu_w": 0.8738532110091743,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3145849802371541,
"calib/std_conf": 0.06500023734599751,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7907023188405797,
"calib/step_q_c_n": 690.0,
"calib/step_q_gap": 0.012018530718589382,
"calib/step_q_w": 0.7786837881219904,
"calib/step_q_w_n": 623.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2728.0,
"completions/max_terminated_length": 2728.0,
"completions/mean_length": 524.91796875,
"completions/mean_terminated_length": 524.91796875,
"completions/min_length": 184.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.010666666666666666,
"grad_norm": 0.04463927820324898,
"kl": 0.0003826320171356201,
"learning_rate": 2.5e-06,
"loss": 0.0955,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03257928788661957,
"mask/share_reasoning": 0.8568609952926636,
"mask/share_step_conf": 0.11055973172187805,
"num_tokens": 2356258.0,
"reward": 1.0168341398239136,
"reward_std": 0.21721628308296204,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.6515105366706848,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7151883840560913,
"step": 10
},
{
"adv/mean_abs_final_conf": 0.7630910277366638,
"adv/mean_abs_reasoning": 0.3175758421421051,
"adv/mean_abs_step_conf": 0.7779554128646851,
"adv/ratio_final_to_reasoning": 2.402862329166728,
"adv/ratio_step_to_reasoning": 2.449668109567902,
"adv/std_final_conf": 0.9286644458770752,
"adv/std_reasoning": 0.5960689783096313,
"adv/std_step_conf": 0.9352787733078003,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.43782581055308323,
"calib/avg_num_step_conf": 5.49609375,
"calib/ece": 0.3308300395256918,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.38735177865612647,
"calib/gap": -0.013174825174825266,
"calib/mean_conf": 0.8860079051383399,
"calib/mu_c": 0.8802797202797202,
"calib/mu_w": 0.8934545454545455,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.325810276679842,
"calib/std_conf": 0.058196477359325455,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.769406674907293,
"calib/step_q_c_n": 809.0,
"calib/step_q_gap": -0.02049299064454646,
"calib/step_q_w": 0.7898996655518394,
"calib/step_q_w_n": 598.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2209.0,
"completions/max_terminated_length": 2209.0,
"completions/mean_length": 515.4296875,
"completions/mean_terminated_length": 515.4296875,
"completions/min_length": 155.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.011733333333333333,
"grad_norm": 0.0435059629380703,
"kl": 0.0006912946701049805,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.0438,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03301956132054329,
"mask/share_reasoning": 0.8474563360214233,
"mask/share_step_conf": 0.11952407658100128,
"num_tokens": 2592688.0,
"reward": 1.001125454902649,
"reward_std": 0.1699179857969284,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.6315202713012695,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7086119651794434,
"step": 11
},
{
"adv/mean_abs_final_conf": 0.757521390914917,
"adv/mean_abs_reasoning": 0.48119303584098816,
"adv/mean_abs_step_conf": 0.7516059279441833,
"adv/ratio_final_to_reasoning": 1.5742567628623005,
"adv/ratio_step_to_reasoning": 1.561963436629108,
"adv/std_final_conf": 0.928837239742279,
"adv/std_reasoning": 0.7393051385879517,
"adv/std_step_conf": 0.9350537657737732,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.4660745384883316,
"calib/avg_num_step_conf": 5.46484375,
"calib/ece": 0.2371428571428571,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.39285714285714285,
"calib/gap": -0.005341692789968655,
"calib/mean_conf": 0.8885714285714287,
"calib/mu_c": 0.8867272727272727,
"calib/mu_w": 0.8920689655172414,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.23547619047619042,
"calib/std_conf": 0.05101687062566043,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7803575685339691,
"calib/step_q_c_n": 839.0,
"calib/step_q_gap": 0.029553997105397634,
"calib/step_q_w": 0.7508035714285715,
"calib/step_q_w_n": 560.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2466.0,
"completions/max_terminated_length": 2466.0,
"completions/mean_length": 468.87890625,
"completions/mean_terminated_length": 472.57086181640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.0128,
"grad_norm": 0.0435611829161644,
"kl": 0.001415252685546875,
"learning_rate": 3e-06,
"loss": -0.0022,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03637603670358658,
"mask/share_reasoning": 0.828792929649353,
"mask/share_step_conf": 0.1270185112953186,
"num_tokens": 2816897.0,
"reward": 1.088067889213562,
"reward_std": 0.21567848324775696,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.6941168308258057,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.7718667387962341,
"step": 12
},
{
"adv/mean_abs_final_conf": 0.7475674152374268,
"adv/mean_abs_reasoning": 0.48703324794769287,
"adv/mean_abs_step_conf": 0.7698397040367126,
"adv/ratio_final_to_reasoning": 1.5349412353008702,
"adv/ratio_step_to_reasoning": 1.5806717657998435,
"adv/std_final_conf": 0.9303745031356812,
"adv/std_reasoning": 0.7574519515037537,
"adv/std_step_conf": 0.9354352355003357,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5476190476190477,
"calib/avg_num_step_conf": 4.99609375,
"calib/ece": 0.27944881889763784,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.421259842519685,
"calib/gap": 0.01138016745159598,
"calib/mean_conf": 0.8936220472440944,
"calib/mu_c": 0.8980128205128205,
"calib/mu_w": 0.8866326530612245,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.27944881889763784,
"calib/std_conf": 0.05016773897241201,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7593807641633729,
"calib/step_q_c_n": 759.0,
"calib/step_q_gap": 0.025323071855680612,
"calib/step_q_w": 0.7340576923076922,
"calib/step_q_w_n": 520.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1709.0,
"completions/max_terminated_length": 1709.0,
"completions/mean_length": 468.07421875,
"completions/mean_terminated_length": 469.9098205566406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.013866666666666666,
"grad_norm": 0.038371726870536804,
"kl": 0.0019371509552001953,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.0169,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03500467911362648,
"mask/share_reasoning": 0.8416920304298401,
"mask/share_step_conf": 0.11939701437950134,
"num_tokens": 3041316.0,
"reward": 1.0731277465820312,
"reward_std": 0.2134917676448822,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.6806222200393677,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.7640678882598877,
"step": 13
},
{
"adv/mean_abs_final_conf": 0.7789779901504517,
"adv/mean_abs_reasoning": 0.5143574476242065,
"adv/mean_abs_step_conf": 0.7580201625823975,
"adv/ratio_final_to_reasoning": 1.514468185011251,
"adv/ratio_step_to_reasoning": 1.4737225368926956,
"adv/std_final_conf": 0.9270016551017761,
"adv/std_reasoning": 0.7575810551643372,
"adv/std_step_conf": 0.9352178573608398,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.4321884775808134,
"calib/avg_num_step_conf": 5.29296875,
"calib/ece": 0.3639357429718876,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.6506024096385542,
"calib/gap": -0.004977189781021796,
"calib/mean_conf": 0.9141365461847389,
"calib/mu_c": 0.9118978102189782,
"calib/mu_w": 0.916875,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3639357429718876,
"calib/std_conf": 0.04368743294022322,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7413020134228188,
"calib/step_q_c_n": 745.0,
"calib/step_q_gap": 0.031105292111343275,
"calib/step_q_w": 0.7101967213114755,
"calib/step_q_w_n": 610.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2679.0,
"completions/max_terminated_length": 2679.0,
"completions/mean_length": 525.13671875,
"completions/mean_terminated_length": 531.3636474609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.014933333333333333,
"grad_norm": 0.03572245314717293,
"kl": 0.003782510757446289,
"learning_rate": 3.5e-06,
"loss": 0.0123,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03229822218418121,
"mask/share_reasoning": 0.8388671875,
"mask/share_step_conf": 0.11711588501930237,
"num_tokens": 3281151.0,
"reward": 1.0118708610534668,
"reward_std": 0.22134985029697418,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.5988633036613464,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.748877227306366,
"step": 14
},
{
"adv/mean_abs_final_conf": 0.7903380393981934,
"adv/mean_abs_reasoning": 0.4590962529182434,
"adv/mean_abs_step_conf": 0.7628530859947205,
"adv/ratio_final_to_reasoning": 1.72150836425785,
"adv/ratio_step_to_reasoning": 1.661640845782661,
"adv/std_final_conf": 0.9195718765258789,
"adv/std_reasoning": 0.701473593711853,
"adv/std_step_conf": 0.935431182384491,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.47,
"calib/avg_num_step_conf": 4.86328125,
"calib/ece": 0.309763779527559,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.6968503937007874,
"calib/gap": 0.001454545454545153,
"calib/mean_conf": 0.9148818897637795,
"calib/mu_c": 0.9154545454545452,
"calib/mu_w": 0.914,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.30917322834645666,
"calib/std_conf": 0.05821310486077129,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6812092391304347,
"calib/step_q_c_n": 736.0,
"calib/step_q_gap": -0.02122691018194245,
"calib/step_q_w": 0.7024361493123772,
"calib/step_q_w_n": 509.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2354.0,
"completions/max_terminated_length": 2354.0,
"completions/mean_length": 454.70703125,
"completions/mean_terminated_length": 454.70703125,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.016,
"grad_norm": 0.03302004188299179,
"kl": 0.007363319396972656,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0141,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.035681482404470444,
"mask/share_reasoning": 0.848019003868103,
"mask/share_step_conf": 0.11629950255155563,
"num_tokens": 3505436.0,
"reward": 1.0653555393218994,
"reward_std": 0.22157421708106995,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6581991910934448,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.7691745758056641,
"step": 15
},
{
"adv/mean_abs_final_conf": 0.7543657422065735,
"adv/mean_abs_reasoning": 0.43760305643081665,
"adv/mean_abs_step_conf": 0.7942132949829102,
"adv/ratio_final_to_reasoning": 1.7238584857229757,
"adv/ratio_step_to_reasoning": 1.8149171567965778,
"adv/std_final_conf": 0.9253305792808533,
"adv/std_reasoning": 0.72050940990448,
"adv/std_step_conf": 0.9354434013366699,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.48640000000000005,
"calib/avg_num_step_conf": 6.37890625,
"calib/ece": 0.3292400000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.796,
"calib/gap": 0.007899999999999796,
"calib/mean_conf": 0.92924,
"calib/mu_c": 0.9324000000000001,
"calib/mu_w": 0.9245000000000003,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3292400000000001,
"calib/std_conf": 0.04746179937591914,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6588923719958202,
"calib/step_q_c_n": 957.0,
"calib/step_q_gap": 0.04671781578280232,
"calib/step_q_w": 0.6121745562130179,
"calib/step_q_w_n": 676.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2956.0,
"completions/max_terminated_length": 2956.0,
"completions/mean_length": 636.96875,
"completions/mean_terminated_length": 641.9842529296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.017066666666666667,
"grad_norm": 0.03509129211306572,
"kl": 0.008379936218261719,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0631,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.026487503200769424,
"mask/share_reasoning": 0.858582615852356,
"mask/share_step_conf": 0.10711735486984253,
"num_tokens": 3777348.0,
"reward": 1.0628373622894287,
"reward_std": 0.21678559482097626,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6378324031829834,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7835614681243896,
"step": 16
},
{
"adv/mean_abs_final_conf": 0.7382602691650391,
"adv/mean_abs_reasoning": 0.39500361680984497,
"adv/mean_abs_step_conf": 0.7596269249916077,
"adv/ratio_final_to_reasoning": 1.8689962262306021,
"adv/ratio_step_to_reasoning": 1.9230885304963996,
"adv/std_final_conf": 0.9201560616493225,
"adv/std_reasoning": 0.6816303730010986,
"adv/std_step_conf": 0.9354602098464966,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5855099956223553,
"calib/avg_num_step_conf": 5.71875,
"calib/ece": 0.24278431372549014,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.7686274509803922,
"calib/gap": 0.011470888661899714,
"calib/mean_conf": 0.9270980392156863,
"calib/mu_c": 0.9305617977528089,
"calib/mu_w": 0.9190909090909092,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.23592156862745092,
"calib/std_conf": 0.05528012616751612,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6086591123701605,
"calib/step_q_c_n": 1059.0,
"calib/step_q_gap": -0.0021803938026789815,
"calib/step_q_w": 0.6108395061728394,
"calib/step_q_w_n": 405.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1437.0,
"completions/max_terminated_length": 1437.0,
"completions/mean_length": 492.609375,
"completions/mean_terminated_length": 494.54119873046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.018133333333333335,
"grad_norm": 0.048337530344724655,
"kl": 0.013302803039550781,
"learning_rate": 4.25e-06,
"loss": 0.0273,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03347424417734146,
"mask/share_reasoning": 0.8387426137924194,
"mask/share_step_conf": 0.12387684732675552,
"num_tokens": 4006984.0,
"reward": 1.1489111185073853,
"reward_std": 0.20086175203323364,
"rewards/accuracy_reward_step": 0.6953125,
"rewards/final_brier_reward_step": 0.7343758344650269,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8183392882347107,
"step": 17
},
{
"adv/mean_abs_final_conf": 0.7539812922477722,
"adv/mean_abs_reasoning": 0.4319751560688019,
"adv/mean_abs_step_conf": 0.7524322271347046,
"adv/ratio_final_to_reasoning": 1.7454274433497363,
"adv/ratio_step_to_reasoning": 1.7418414382489686,
"adv/std_final_conf": 0.9187299013137817,
"adv/std_reasoning": 0.7205332517623901,
"adv/std_step_conf": 0.9355179667472839,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5428461142563047,
"calib/avg_num_step_conf": 4.62890625,
"calib/ece": 0.4055599999999999,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.86,
"calib/gap": 0.007656973751930085,
"calib/mean_conf": 0.93988,
"calib/mu_c": 0.9434328358208955,
"calib/mu_w": 0.9357758620689655,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.4047199999999999,
"calib/std_conf": 0.04093391747683087,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.6104299363057325,
"calib/step_q_c_n": 628.0,
"calib/step_q_gap": 0.02750354492332674,
"calib/step_q_w": 0.5829263913824058,
"calib/step_q_w_n": 557.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2882.0,
"completions/max_terminated_length": 2882.0,
"completions/mean_length": 484.90234375,
"completions/mean_terminated_length": 490.6521911621094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 185.0,
"epoch": 0.0192,
"grad_norm": 0.02956167608499527,
"kl": 0.015323638916015625,
"learning_rate": 4.5e-06,
"loss": -0.0949,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.033106379210948944,
"mask/share_reasoning": 0.848029375076294,
"mask/share_step_conf": 0.1071455180644989,
"num_tokens": 4241839.0,
"reward": 1.012449860572815,
"reward_std": 0.20409558713436127,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.5725746154785156,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.7687375545501709,
"step": 18
},
{
"adv/mean_abs_final_conf": 0.725152313709259,
"adv/mean_abs_reasoning": 0.39276736974716187,
"adv/mean_abs_step_conf": 0.7688818573951721,
"adv/ratio_final_to_reasoning": 1.846264149122329,
"adv/ratio_step_to_reasoning": 1.95760115686323,
"adv/std_final_conf": 0.9252442717552185,
"adv/std_reasoning": 0.6815301775932312,
"adv/std_step_conf": 0.9354442358016968,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5053263966307444,
"calib/avg_num_step_conf": 4.5546875,
"calib/ece": 0.3896470588235294,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.8549019607843137,
"calib/gap": 0.025685618729096693,
"calib/mean_conf": 0.9308235294117647,
"calib/mu_c": 0.9426086956521738,
"calib/mu_w": 0.9169230769230771,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3896470588235294,
"calib/std_conf": 0.09485855432547834,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.565136,
"calib/step_q_c_n": 625.0,
"calib/step_q_gap": 0.009831009242144084,
"calib/step_q_w": 0.5553049907578559,
"calib/step_q_w_n": 541.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2276.0,
"completions/max_terminated_length": 2276.0,
"completions/mean_length": 481.9921875,
"completions/mean_terminated_length": 481.9921875,
"completions/min_length": 155.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.020266666666666665,
"grad_norm": 0.02938079461455345,
"kl": 0.022005081176757812,
"learning_rate": 4.75e-06,
"loss": 0.0076,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.033192120492458344,
"mask/share_reasoning": 0.8598717451095581,
"mask/share_step_conf": 0.10693618655204773,
"num_tokens": 4469989.0,
"reward": 1.0599896907806396,
"reward_std": 0.19520623981952667,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.601270318031311,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8077850341796875,
"step": 19
},
{
"adv/mean_abs_final_conf": 0.7167485952377319,
"adv/mean_abs_reasoning": 0.3509178161621094,
"adv/mean_abs_step_conf": 0.7590775489807129,
"adv/ratio_final_to_reasoning": 2.0424970241654075,
"adv/ratio_step_to_reasoning": 2.1631205770129687,
"adv/std_final_conf": 0.9080055356025696,
"adv/std_reasoning": 0.661155104637146,
"adv/std_step_conf": 0.935492992401123,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.47865067079463364,
"calib/avg_num_step_conf": 5.5234375,
"calib/ece": 0.41544000000000003,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.948,
"calib/gap": -0.002799277605779249,
"calib/mean_conf": 0.9575999999999999,
"calib/mu_c": 0.9563235294117647,
"calib/mu_w": 0.9591228070175439,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.41452,
"calib/std_conf": 0.03253060097815593,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.524655172413793,
"calib/step_q_c_n": 696.0,
"calib/step_q_gap": 0.036813946787052076,
"calib/step_q_w": 0.4878412256267409,
"calib/step_q_w_n": 718.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2542.0,
"completions/max_terminated_length": 2542.0,
"completions/mean_length": 496.65234375,
"completions/mean_terminated_length": 496.65234375,
"completions/min_length": 159.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.021333333333333333,
"grad_norm": 0.024263571947813034,
"kl": 0.030767440795898438,
"learning_rate": 5e-06,
"loss": 0.0385,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.035842910408973694,
"mask/share_reasoning": 0.8330389261245728,
"mask/share_step_conf": 0.13111810386180878,
"num_tokens": 4702004.0,
"reward": 1.026486873626709,
"reward_std": 0.18262585997581482,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.5648671984672546,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7905083894729614,
"step": 20
},
{
"adv/mean_abs_final_conf": 0.7449398040771484,
"adv/mean_abs_reasoning": 0.5243001580238342,
"adv/mean_abs_step_conf": 0.7196472883224487,
"adv/ratio_final_to_reasoning": 1.4208269684391057,
"adv/ratio_step_to_reasoning": 1.372586441772032,
"adv/std_final_conf": 0.9047093987464905,
"adv/std_reasoning": 0.7753930687904358,
"adv/std_step_conf": 0.9356669187545776,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.48425430778371953,
"calib/avg_num_step_conf": 5.73828125,
"calib/ece": 0.3716269841269841,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9603174603174603,
"calib/gap": -0.011437908496732097,
"calib/mean_conf": 0.9630555555555556,
"calib/mu_c": 0.9585620915032681,
"calib/mu_w": 0.9700000000000002,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.36376984126984124,
"calib/std_conf": 0.06451120310673689,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5064848484848485,
"calib/step_q_c_n": 825.0,
"calib/step_q_gap": 0.01601900997553174,
"calib/step_q_w": 0.4904658385093168,
"calib/step_q_w_n": 644.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2817.0,
"completions/max_terminated_length": 2817.0,
"completions/mean_length": 509.41015625,
"completions/mean_terminated_length": 509.41015625,
"completions/min_length": 145.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.0224,
"grad_norm": 0.023283572867512703,
"kl": 0.03099822998046875,
"learning_rate": 4.9722222222222224e-06,
"loss": 0.0217,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.033678602427244186,
"mask/share_reasoning": 0.8410661220550537,
"mask/share_step_conf": 0.1252552568912506,
"num_tokens": 4935373.0,
"reward": 1.068263053894043,
"reward_std": 0.23257187008857727,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6154191493988037,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8026129007339478,
"step": 21
},
{
"adv/mean_abs_final_conf": 0.6880729794502258,
"adv/mean_abs_reasoning": 0.2956817150115967,
"adv/mean_abs_step_conf": 0.7522487640380859,
"adv/ratio_final_to_reasoning": 2.327073148311655,
"adv/ratio_step_to_reasoning": 2.544116615424064,
"adv/std_final_conf": 0.8778355121612549,
"adv/std_reasoning": 0.5959526896476746,
"adv/std_step_conf": 0.9354989528656006,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.4271016650873155,
"calib/avg_num_step_conf": 5.609375,
"calib/ece": 0.32290196078431377,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.9764705882352941,
"calib/gap": -0.007377149045620923,
"calib/mean_conf": 0.9656470588235294,
"calib/mu_c": 0.9630722891566263,
"calib/mu_w": 0.9704494382022473,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.31878431372549026,
"calib/std_conf": 0.045411764705882346,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4753470715835141,
"calib/step_q_c_n": 922.0,
"calib/step_q_gap": 0.011572752517366225,
"calib/step_q_w": 0.46377431906614786,
"calib/step_q_w_n": 514.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2435.0,
"completions/max_terminated_length": 2435.0,
"completions/mean_length": 474.9765625,
"completions/mean_terminated_length": 474.9765625,
"completions/min_length": 147.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.023466666666666667,
"grad_norm": 0.029273828491568565,
"kl": 0.039241790771484375,
"learning_rate": 4.944444444444445e-06,
"loss": 0.0079,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03368385136127472,
"mask/share_reasoning": 0.8386745452880859,
"mask/share_step_conf": 0.12764160335063934,
"num_tokens": 5158783.0,
"reward": 1.1068484783172607,
"reward_std": 0.15006275475025177,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.6657546758651733,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.812690794467926,
"step": 22
},
{
"adv/mean_abs_final_conf": 0.7450026869773865,
"adv/mean_abs_reasoning": 0.5139386653900146,
"adv/mean_abs_step_conf": 0.757665753364563,
"adv/ratio_final_to_reasoning": 1.449594547263774,
"adv/ratio_step_to_reasoning": 1.4742338033461448,
"adv/std_final_conf": 0.9089747667312622,
"adv/std_reasoning": 0.7575487494468689,
"adv/std_step_conf": 0.9355910420417786,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.518332713984888,
"calib/avg_num_step_conf": 5.36328125,
"calib/ece": 0.4283921568627452,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.9725490196078431,
"calib/gap": 0.0003084355258267113,
"calib/mean_conf": 0.9695686274509804,
"calib/mu_c": 0.9697101449275362,
"calib/mu_w": 0.9694017094017094,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4283921568627452,
"calib/std_conf": 0.03272299359595352,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.479627659574468,
"calib/step_q_c_n": 752.0,
"calib/step_q_gap": 0.017357128173501768,
"calib/step_q_w": 0.46227053140096624,
"calib/step_q_w_n": 621.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1287.0,
"completions/max_terminated_length": 1287.0,
"completions/mean_length": 499.3984375,
"completions/mean_terminated_length": 501.3569030761719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.024533333333333334,
"grad_norm": 0.03355023264884949,
"kl": 0.0365142822265625,
"learning_rate": 4.9166666666666665e-06,
"loss": -0.0396,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.035151075571775436,
"mask/share_reasoning": 0.8352031707763672,
"mask/share_step_conf": 0.12573951482772827,
"num_tokens": 5390565.0,
"reward": 1.0469937324523926,
"reward_std": 0.2159760296344757,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.5650421380996704,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8146092891693115,
"step": 23
},
{
"adv/mean_abs_final_conf": 0.7351275682449341,
"adv/mean_abs_reasoning": 0.5750550627708435,
"adv/mean_abs_step_conf": 0.7765597105026245,
"adv/ratio_final_to_reasoning": 1.278360309885453,
"adv/ratio_step_to_reasoning": 1.3504093099552097,
"adv/std_final_conf": 0.9156761169433594,
"adv/std_reasoning": 0.8098498582839966,
"adv/std_step_conf": 0.9356335997581482,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.536032258064516,
"calib/avg_num_step_conf": 6.109375,
"calib/ece": 0.47028112449799203,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.963855421686747,
"calib/gap": 0.006545161290322676,
"calib/mean_conf": 0.965140562248996,
"calib/mu_c": 0.9684,
"calib/mu_w": 0.9618548387096774,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.466706827309237,
"calib/std_conf": 0.07599461195023637,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.490234375,
"calib/step_q_c_n": 768.0,
"calib/step_q_gap": 0.024204224246231087,
"calib/step_q_w": 0.4660301507537689,
"calib/step_q_w_n": 796.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2551.0,
"completions/max_terminated_length": 2551.0,
"completions/mean_length": 561.50390625,
"completions/mean_terminated_length": 563.7059326171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.0256,
"grad_norm": 0.024830345064401627,
"kl": 0.04190635681152344,
"learning_rate": 4.888888888888889e-06,
"loss": 0.0675,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03164277225732803,
"mask/share_reasoning": 0.8389121294021606,
"mask/share_step_conf": 0.12553885579109192,
"num_tokens": 5638822.0,
"reward": 1.0030418634414673,
"reward_std": 0.24848011136054993,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.5184351205825806,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.7969739437103271,
"step": 24
},
{
"adv/mean_abs_final_conf": 0.7402602434158325,
"adv/mean_abs_reasoning": 0.42128893733024597,
"adv/mean_abs_step_conf": 0.7585857510566711,
"adv/ratio_final_to_reasoning": 1.7571319297082486,
"adv/ratio_step_to_reasoning": 1.8006305977648307,
"adv/std_final_conf": 0.9028820991516113,
"adv/std_reasoning": 0.6817774176597595,
"adv/std_step_conf": 0.935698926448822,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5652805280528053,
"calib/avg_num_step_conf": 5.62890625,
"calib/ece": 0.38023904382470125,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9840637450199203,
"calib/gap": 0.0013273927392741802,
"calib/mean_conf": 0.9701992031872511,
"calib/mu_c": 0.9707333333333334,
"calib/mu_w": 0.9694059405940593,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.37641434262948215,
"calib/std_conf": 0.0643908025849377,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5059181141439206,
"calib/step_q_c_n": 806.0,
"calib/step_q_gap": 0.03667401965573158,
"calib/step_q_w": 0.46924409448818905,
"calib/step_q_w_n": 635.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2649.0,
"completions/max_terminated_length": 2649.0,
"completions/mean_length": 488.55078125,
"completions/mean_terminated_length": 490.4667053222656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.02666666666666667,
"grad_norm": 0.0347868986427784,
"kl": 0.035980224609375,
"learning_rate": 4.861111111111111e-06,
"loss": 0.0343,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03435831144452095,
"mask/share_reasoning": 0.8342337012290955,
"mask/share_step_conf": 0.1275017410516739,
"num_tokens": 5867115.0,
"reward": 1.0532389879226685,
"reward_std": 0.2238880842924118,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6051421165466309,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7920362949371338,
"step": 25
},
{
"adv/mean_abs_final_conf": 0.7297452092170715,
"adv/mean_abs_reasoning": 0.36826425790786743,
"adv/mean_abs_step_conf": 0.749592661857605,
"adv/ratio_final_to_reasoning": 1.9815803286552984,
"adv/ratio_step_to_reasoning": 2.0354749225897955,
"adv/std_final_conf": 0.882972002029419,
"adv/std_reasoning": 0.6402753591537476,
"adv/std_step_conf": 0.9351133704185486,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5537442689760571,
"calib/avg_num_step_conf": 5.4296875,
"calib/ece": 0.3780392156862745,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.984313725490196,
"calib/gap": 0.006663907284768289,
"calib/mean_conf": 0.9701960784313725,
"calib/mu_c": 0.9729139072847683,
"calib/mu_w": 0.96625,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3780392156862745,
"calib/std_conf": 0.02694149042257921,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5008847184986595,
"calib/step_q_c_n": 746.0,
"calib/step_q_gap": 0.02464248247381473,
"calib/step_q_w": 0.4762422360248447,
"calib/step_q_w_n": 644.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2088.0,
"completions/max_terminated_length": 2088.0,
"completions/mean_length": 484.125,
"completions/mean_terminated_length": 486.0235595703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 194.0,
"epoch": 0.027733333333333332,
"grad_norm": 0.025464007630944252,
"kl": 0.036617279052734375,
"learning_rate": 4.833333333333333e-06,
"loss": -0.0449,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03218268230557442,
"mask/share_reasoning": 0.8462972640991211,
"mask/share_step_conf": 0.1176137626171112,
"num_tokens": 6096291.0,
"reward": 1.0802245140075684,
"reward_std": 0.17607977986335754,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6156578063964844,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8184024095535278,
"step": 26
},
{
"adv/mean_abs_final_conf": 0.7717294692993164,
"adv/mean_abs_reasoning": 0.4704931378364563,
"adv/mean_abs_step_conf": 0.772092878818512,
"adv/ratio_final_to_reasoning": 1.6402565887529905,
"adv/ratio_step_to_reasoning": 1.6410289900697594,
"adv/std_final_conf": 0.9117308259010315,
"adv/std_reasoning": 0.7206444144248962,
"adv/std_step_conf": 0.9351636171340942,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5027194535795598,
"calib/avg_num_step_conf": 6.2734375,
"calib/ece": 0.4304761904761905,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9801587301587301,
"calib/gap": 0.018547938274728004,
"calib/mean_conf": 0.9595238095238096,
"calib/mu_c": 0.9682089552238806,
"calib/mu_w": 0.9496610169491526,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.42912698412698413,
"calib/std_conf": 0.09490477385133608,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4985406698564593,
"calib/step_q_c_n": 836.0,
"calib/step_q_gap": 0.017735475051264438,
"calib/step_q_w": 0.48080519480519485,
"calib/step_q_w_n": 770.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2749.0,
"completions/max_terminated_length": 2749.0,
"completions/mean_length": 496.59375,
"completions/mean_terminated_length": 500.5039367675781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 205.0,
"epoch": 0.0288,
"grad_norm": 0.029237594455480576,
"kl": 0.035808563232421875,
"learning_rate": 4.805555555555556e-06,
"loss": -0.0563,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03279014676809311,
"mask/share_reasoning": 0.8255751729011536,
"mask/share_step_conf": 0.13382220268249512,
"num_tokens": 6328635.0,
"reward": 1.0464885234832764,
"reward_std": 0.22028236091136932,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.5554590225219727,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8239703178405762,
"step": 27
},
{
"adv/mean_abs_final_conf": 0.7000174522399902,
"adv/mean_abs_reasoning": 0.3728755712509155,
"adv/mean_abs_step_conf": 0.7837008237838745,
"adv/ratio_final_to_reasoning": 1.877348655186999,
"adv/ratio_step_to_reasoning": 2.1017757241503663,
"adv/std_final_conf": 0.8813297152519226,
"adv/std_reasoning": 0.6612759828567505,
"adv/std_step_conf": 0.9354707598686218,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5263845889232885,
"calib/avg_num_step_conf": 5.671875,
"calib/ece": 0.35358870967741945,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.9838709677419355,
"calib/gap": 0.0007285861713107744,
"calib/mean_conf": 0.9669758064516131,
"calib/mu_c": 0.9672549019607843,
"calib/mu_w": 0.9665263157894736,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.35181451612903236,
"calib/std_conf": 0.03516274689939519,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.497979094076655,
"calib/step_q_c_n": 861.0,
"calib/step_q_gap": 0.018402105920986678,
"calib/step_q_w": 0.47957698815566835,
"calib/step_q_w_n": 591.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2477.0,
"completions/max_terminated_length": 2477.0,
"completions/mean_length": 556.34375,
"completions/mean_terminated_length": 560.7244262695312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.029866666666666666,
"grad_norm": 0.03544874116778374,
"kl": 0.033390045166015625,
"learning_rate": 4.777777777777778e-06,
"loss": -0.0343,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03131452947854996,
"mask/share_reasoning": 0.8450330495834351,
"mask/share_step_conf": 0.11583994328975677,
"num_tokens": 6578003.0,
"reward": 1.0687757730484009,
"reward_std": 0.1918598711490631,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6202456951141357,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8026830554008484,
"step": 28
},
{
"adv/mean_abs_final_conf": 0.7551331520080566,
"adv/mean_abs_reasoning": 0.5309878587722778,
"adv/mean_abs_step_conf": 0.757839024066925,
"adv/ratio_final_to_reasoning": 1.4221288482076329,
"adv/ratio_step_to_reasoning": 1.4272247689790132,
"adv/std_final_conf": 0.9082273840904236,
"adv/std_reasoning": 0.7754151225090027,
"adv/std_step_conf": 0.9351565837860107,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6091259802681508,
"calib/avg_num_step_conf": 5.9375,
"calib/ece": 0.4382936507936509,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.9801587301587301,
"calib/gap": 0.008840121426764491,
"calib/mean_conf": 0.9700396825396826,
"calib/mu_c": 0.9741791044776119,
"calib/mu_w": 0.9653389830508474,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4382936507936509,
"calib/std_conf": 0.021903431924197383,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4950569800569801,
"calib/step_q_c_n": 702.0,
"calib/step_q_gap": 0.011951845582652498,
"calib/step_q_w": 0.4831051344743276,
"calib/step_q_w_n": 818.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2469.0,
"completions/max_terminated_length": 2469.0,
"completions/mean_length": 552.953125,
"completions/mean_terminated_length": 555.12158203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.030933333333333334,
"grad_norm": 0.02542717568576336,
"kl": 0.03304290771484375,
"learning_rate": 4.75e-06,
"loss": -0.0293,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.030591856688261032,
"mask/share_reasoning": 0.8471544981002808,
"mask/share_step_conf": 0.1183474063873291,
"num_tokens": 6826687.0,
"reward": 1.050892949104309,
"reward_std": 0.22539781033992767,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.5498980283737183,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8345919251441956,
"step": 29
},
{
"adv/mean_abs_final_conf": 0.7546311616897583,
"adv/mean_abs_reasoning": 0.6352213621139526,
"adv/mean_abs_step_conf": 0.7367050051689148,
"adv/ratio_final_to_reasoning": 1.1879813978207878,
"adv/ratio_step_to_reasoning": 1.1597610677280041,
"adv/std_final_conf": 0.9116274118423462,
"adv/std_reasoning": 0.8429492712020874,
"adv/std_step_conf": 0.9353036880493164,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5105012878937983,
"calib/avg_num_step_conf": 6.34765625,
"calib/ece": 0.3717199999999999,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.968,
"calib/gap": 0.023630539594478583,
"calib/mean_conf": 0.95972,
"calib/mu_c": 0.9694557823129251,
"calib/mu_w": 0.9458252427184465,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3717199999999999,
"calib/std_conf": 0.09404850663354522,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4668671963677639,
"calib/step_q_c_n": 881.0,
"calib/step_q_gap": 0.034877949055935886,
"calib/step_q_w": 0.43198924731182803,
"calib/step_q_w_n": 744.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2652.0,
"completions/max_terminated_length": 2652.0,
"completions/mean_length": 620.7265625,
"completions/mean_terminated_length": 623.1608276367188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 203.0,
"epoch": 0.032,
"grad_norm": 0.030656639486551285,
"kl": 0.038387298583984375,
"learning_rate": 4.722222222222222e-06,
"loss": -0.0384,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.027659885585308075,
"mask/share_reasoning": 0.8523433208465576,
"mask/share_step_conf": 0.11609058082103729,
"num_tokens": 7092577.0,
"reward": 1.0673344135284424,
"reward_std": 0.26539111137390137,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.6036843657493591,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8144062757492065,
"step": 30
},
{
"adv/mean_abs_final_conf": 0.7504103183746338,
"adv/mean_abs_reasoning": 0.48929375410079956,
"adv/mean_abs_step_conf": 0.7482815384864807,
"adv/ratio_final_to_reasoning": 1.5336601215229113,
"adv/ratio_step_to_reasoning": 1.529309402000515,
"adv/std_final_conf": 0.8871038556098938,
"adv/std_reasoning": 0.7393369674682617,
"adv/std_step_conf": 0.9349742531776428,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5178012764359905,
"calib/avg_num_step_conf": 6.8046875,
"calib/ece": 0.4745454545454547,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9565217391304348,
"calib/gap": 0.02865473657865092,
"calib/mean_conf": 0.9533596837944663,
"calib/mu_c": 0.9681967213114755,
"calib/mu_w": 0.9395419847328246,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4728458498023717,
"calib/std_conf": 0.12559808331621283,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.44105744125326374,
"calib/step_q_c_n": 766.0,
"calib/step_q_gap": 0.00939760518768995,
"calib/step_q_w": 0.4316598360655738,
"calib/step_q_w_n": 976.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2590.0,
"completions/max_terminated_length": 2590.0,
"completions/mean_length": 583.03125,
"completions/mean_terminated_length": 583.03125,
"completions/min_length": 187.0,
"completions/min_terminated_length": 187.0,
"epoch": 0.03306666666666667,
"grad_norm": 0.03860628604888916,
"kl": 0.0362091064453125,
"learning_rate": 4.694444444444445e-06,
"loss": 0.0399,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03051367774605751,
"mask/share_reasoning": 0.8445330858230591,
"mask/share_step_conf": 0.12495321035385132,
"num_tokens": 7347745.0,
"reward": 1.033031702041626,
"reward_std": 0.19907008111476898,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.5206976532936096,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8349312543869019,
"step": 31
},
{
"adv/mean_abs_final_conf": 0.7279558777809143,
"adv/mean_abs_reasoning": 0.4749346375465393,
"adv/mean_abs_step_conf": 0.7677797079086304,
"adv/ratio_final_to_reasoning": 1.5327496043275664,
"adv/ratio_step_to_reasoning": 1.6166007850572803,
"adv/std_final_conf": 0.8810470104217529,
"adv/std_reasoning": 0.7392776608467102,
"adv/std_step_conf": 0.9347826242446899,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.661967418546366,
"calib/avg_num_step_conf": 5.70703125,
"calib/ece": 0.4214624505928855,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9565217391304348,
"calib/gap": 0.05381265664160384,
"calib/mean_conf": 0.9462055335968379,
"calib/mu_c": 0.9717293233082706,
"calib/mu_w": 0.9179166666666667,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4209881422924902,
"calib/std_conf": 0.1326554172743245,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.42289875173370317,
"calib/step_q_c_n": 721.0,
"calib/step_q_gap": 0.058993346328297835,
"calib/step_q_w": 0.36390540540540534,
"calib/step_q_w_n": 740.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3056.0,
"completions/max_terminated_length": 3056.0,
"completions/mean_length": 518.2421875,
"completions/mean_terminated_length": 520.2745361328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.034133333333333335,
"grad_norm": 0.03688067942857742,
"kl": 0.06610107421875,
"learning_rate": 4.666666666666667e-06,
"loss": -0.0832,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03253469243645668,
"mask/share_reasoning": 0.8402843475341797,
"mask/share_step_conf": 0.12327471375465393,
"num_tokens": 7587119.0,
"reward": 1.0722166299819946,
"reward_std": 0.19639672338962555,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.5762332081794739,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8444249629974365,
"step": 32
},
{
"adv/mean_abs_final_conf": 0.7129478454589844,
"adv/mean_abs_reasoning": 0.39785322546958923,
"adv/mean_abs_step_conf": 0.7419391870498657,
"adv/ratio_final_to_reasoning": 1.791987094279521,
"adv/ratio_step_to_reasoning": 1.864856533899277,
"adv/std_final_conf": 0.8591421842575073,
"adv/std_reasoning": 0.6613561511039734,
"adv/std_step_conf": 0.9346285462379456,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6605691056910569,
"calib/avg_num_step_conf": 6.5234375,
"calib/ece": 0.4621115537848607,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9760956175298805,
"calib/gap": 0.015661839430894386,
"calib/mean_conf": 0.9690438247011953,
"calib/mu_c": 0.9767187500000002,
"calib/mu_w": 0.9610569105691058,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4605976095617531,
"calib/std_conf": 0.04849818322995946,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.38405370843989767,
"calib/step_q_c_n": 782.0,
"calib/step_q_gap": 0.02532623096242026,
"calib/step_q_w": 0.3587274774774774,
"calib/step_q_w_n": 888.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2521.0,
"completions/max_terminated_length": 2521.0,
"completions/mean_length": 528.47265625,
"completions/mean_terminated_length": 530.5451049804688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.0352,
"grad_norm": 0.03596274182200432,
"kl": 0.14304733276367188,
"learning_rate": 4.638888888888889e-06,
"loss": -0.0427,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03191022202372551,
"mask/share_reasoning": 0.836162805557251,
"mask/share_step_conf": 0.12802070379257202,
"num_tokens": 7829280.0,
"reward": 1.0433704853057861,
"reward_std": 0.17046688497066498,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.534176230430603,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8376471996307373,
"step": 33
},
{
"adv/mean_abs_final_conf": 0.7462908029556274,
"adv/mean_abs_reasoning": 0.5331032872200012,
"adv/mean_abs_step_conf": 0.7383915185928345,
"adv/ratio_final_to_reasoning": 1.3998990830601423,
"adv/ratio_step_to_reasoning": 1.3850815335305087,
"adv/std_final_conf": 0.9107847213745117,
"adv/std_reasoning": 0.7754673957824707,
"adv/std_step_conf": 0.934866726398468,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5206094364351245,
"calib/avg_num_step_conf": 6.18359375,
"calib/ece": 0.4085542168674698,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.9196787148594378,
"calib/gap": -0.011357798165137711,
"calib/mean_conf": 0.9529718875502009,
"calib/mu_c": 0.9479999999999998,
"calib/mu_w": 0.9593577981651376,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.39963855421686734,
"calib/std_conf": 0.10265943234329516,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.3455625790139064,
"calib/step_q_c_n": 791.0,
"calib/step_q_gap": 0.023289851741179146,
"calib/step_q_w": 0.32227272727272727,
"calib/step_q_w_n": 792.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2105.0,
"completions/max_terminated_length": 2105.0,
"completions/mean_length": 461.56640625,
"completions/mean_terminated_length": 467.03955078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.03626666666666667,
"grad_norm": 0.04626227170228958,
"kl": 0.05771636962890625,
"learning_rate": 4.611111111111112e-06,
"loss": -0.141,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03449372947216034,
"mask/share_reasoning": 0.8127261400222778,
"mask/share_step_conf": 0.14106135070323944,
"num_tokens": 8052553.0,
"reward": 1.0586323738098145,
"reward_std": 0.21905234456062317,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.5688515901565552,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8296712040901184,
"step": 34
},
{
"adv/mean_abs_final_conf": 0.768435001373291,
"adv/mean_abs_reasoning": 0.5288388729095459,
"adv/mean_abs_step_conf": 0.722439706325531,
"adv/ratio_final_to_reasoning": 1.4530607350128104,
"adv/ratio_step_to_reasoning": 1.3660866160440122,
"adv/std_final_conf": 0.9130533337593079,
"adv/std_reasoning": 0.7755023837089539,
"adv/std_step_conf": 0.935539722442627,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6663034308211473,
"calib/avg_num_step_conf": 6.1484375,
"calib/ece": 0.3864435146443515,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.8493723849372385,
"calib/gap": 0.09799564116985382,
"calib/mean_conf": 0.910376569037657,
"calib/mu_c": 0.9562992125984252,
"calib/mu_w": 0.8583035714285714,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3827196652719666,
"calib/std_conf": 0.20581165079173147,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.33267737617135207,
"calib/step_q_c_n": 747.0,
"calib/step_q_gap": 0.02509575585696272,
"calib/step_q_w": 0.30758162031438935,
"calib/step_q_w_n": 827.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2553.0,
"completions/max_terminated_length": 2553.0,
"completions/mean_length": 547.1171875,
"completions/mean_terminated_length": 551.4251708984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.037333333333333336,
"grad_norm": 0.07981492578983307,
"kl": 0.056163787841796875,
"learning_rate": 4.583333333333333e-06,
"loss": -0.1449,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.028393005952239037,
"mask/share_reasoning": 0.8495216369628906,
"mask/share_step_conf": 0.1142728328704834,
"num_tokens": 8301871.0,
"reward": 1.0419251918792725,
"reward_std": 0.27103620767593384,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.5730335712432861,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l2_reward": 0.813461184501648,
"step": 35
},
{
"adv/mean_abs_final_conf": 0.6559814214706421,
"adv/mean_abs_reasoning": 0.40428081154823303,
"adv/mean_abs_step_conf": 0.7389465570449829,
"adv/ratio_final_to_reasoning": 1.6225885640193924,
"adv/ratio_step_to_reasoning": 1.8278051689248238,
"adv/std_final_conf": 0.8501717448234558,
"adv/std_reasoning": 0.701352059841156,
"adv/std_step_conf": 0.9354451894760132,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6510141093474426,
"calib/avg_num_step_conf": 6.2421875,
"calib/ece": 0.20823293172690757,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9437751004016064,
"calib/gap": 0.024994708994708903,
"calib/mean_conf": 0.9596385542168674,
"calib/mu_c": 0.9656613756613757,
"calib/mu_w": 0.9406666666666668,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20441767068273087,
"calib/std_conf": 0.07641746131016414,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3287301587301587,
"calib/step_q_c_n": 1134.0,
"calib/step_q_gap": 0.021812055281882792,
"calib/step_q_w": 0.3069181034482759,
"calib/step_q_w_n": 464.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2940.0,
"completions/max_terminated_length": 2940.0,
"completions/mean_length": 504.05859375,
"completions/mean_terminated_length": 504.05859375,
"completions/min_length": 143.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.0384,
"grad_norm": 0.11191720515489578,
"kl": 0.09484481811523438,
"learning_rate": 4.555555555555556e-06,
"loss": 0.0777,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03631111979484558,
"mask/share_reasoning": 0.8209631443023682,
"mask/share_step_conf": 0.14272576570510864,
"num_tokens": 8533622.0,
"reward": 1.140600323677063,
"reward_std": 0.18435192108154297,
"rewards/accuracy_reward_step": 0.73828125,
"rewards/final_brier_reward_step": 0.7588292956352234,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.7867891788482666,
"step": 36
},
{
"adv/mean_abs_final_conf": 0.6528252363204956,
"adv/mean_abs_reasoning": 0.3660427927970886,
"adv/mean_abs_step_conf": 0.7701745629310608,
"adv/ratio_final_to_reasoning": 1.783466985736778,
"adv/ratio_step_to_reasoning": 2.104056077831309,
"adv/std_final_conf": 0.8169144988059998,
"adv/std_reasoning": 0.6403437256813049,
"adv/std_step_conf": 0.9345270395278931,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7279202279202279,
"calib/avg_num_step_conf": 6.16796875,
"calib/ece": 0.40871485943775115,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.7710843373493976,
"calib/gap": 0.14789432789432788,
"calib/mean_conf": 0.8770682730923695,
"calib/mu_c": 0.9554700854700854,
"calib/mu_w": 0.8075757575757575,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.40795180722891583,
"calib/std_conf": 0.2362011925687789,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3431372549019608,
"calib/step_q_c_n": 612.0,
"calib/step_q_gap": 0.07033477299916857,
"calib/step_q_w": 0.2728024819027922,
"calib/step_q_w_n": 967.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 3028.0,
"completions/max_terminated_length": 3028.0,
"completions/mean_length": 513.05078125,
"completions/mean_terminated_length": 519.1343994140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.039466666666666664,
"grad_norm": 0.061256974935531616,
"kl": 0.050884246826171875,
"learning_rate": 4.527777777777778e-06,
"loss": -0.0415,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.033428847789764404,
"mask/share_reasoning": 0.8275771141052246,
"mask/share_step_conf": 0.1272752583026886,
"num_tokens": 8772059.0,
"reward": 1.063904881477356,
"reward_std": 0.17878447473049164,
"rewards/accuracy_reward_step": 0.45703125,
"rewards/final_brier_reward_step": 0.5865042805671692,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8369119167327881,
"step": 37
},
{
"adv/mean_abs_final_conf": 0.5637679696083069,
"adv/mean_abs_reasoning": 0.3914545178413391,
"adv/mean_abs_step_conf": 0.7495837211608887,
"adv/ratio_final_to_reasoning": 1.4401876690992959,
"adv/ratio_step_to_reasoning": 1.9148679782633224,
"adv/std_final_conf": 0.796682596206665,
"adv/std_reasoning": 0.6815966367721558,
"adv/std_step_conf": 0.9347833395004272,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6690769330988209,
"calib/avg_num_step_conf": 6.109375,
"calib/ece": 0.39092741935483877,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.8185483870967742,
"calib/gap": 0.11977916748094608,
"calib/mean_conf": 0.9041532258064516,
"calib/mu_c": 0.9616279069767443,
"calib/mu_w": 0.8418487394957982,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3874596774193549,
"calib/std_conf": 0.2034364267963304,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.38266757865937073,
"calib/step_q_c_n": 731.0,
"calib/step_q_gap": 0.09713336497389652,
"calib/step_q_w": 0.2855342136854742,
"calib/step_q_w_n": 833.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2929.0,
"completions/max_terminated_length": 2929.0,
"completions/mean_length": 510.79296875,
"completions/mean_terminated_length": 516.849853515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.04053333333333333,
"grad_norm": 0.06795842945575714,
"kl": 0.051410675048828125,
"learning_rate": 4.5e-06,
"loss": -0.0345,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.032852061092853546,
"mask/share_reasoning": 0.8267712593078613,
"mask/share_step_conf": 0.12865795195102692,
"num_tokens": 9009710.0,
"reward": 1.0843178033828735,
"reward_std": 0.1842365264892578,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.6019449234008789,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8481062054634094,
"step": 38
},
{
"adv/mean_abs_final_conf": 0.6071762442588806,
"adv/mean_abs_reasoning": 0.30981796979904175,
"adv/mean_abs_step_conf": 0.7679427862167358,
"adv/ratio_final_to_reasoning": 1.9597838196819743,
"adv/ratio_step_to_reasoning": 2.4786902667874595,
"adv/std_final_conf": 0.819245457649231,
"adv/std_reasoning": 0.5726991891860962,
"adv/std_step_conf": 0.9341544508934021,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6572,
"calib/avg_num_step_conf": 6.328125,
"calib/ece": 0.43850980392156863,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.7529411764705882,
"calib/gap": 0.05800923076923081,
"calib/mean_conf": 0.8866666666666666,
"calib/mu_c": 0.9162400000000002,
"calib/mu_w": 0.8582307692307694,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.41749019607843135,
"calib/std_conf": 0.21295738933693426,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.36389705882352946,
"calib/step_q_c_n": 680.0,
"calib/step_q_gap": 0.04426939924906137,
"calib/step_q_w": 0.3196276595744681,
"calib/step_q_w_n": 940.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2230.0,
"completions/max_terminated_length": 2230.0,
"completions/mean_length": 486.4375,
"completions/mean_terminated_length": 486.4375,
"completions/min_length": 158.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.0416,
"grad_norm": 0.05202677845954895,
"kl": 0.0638427734375,
"learning_rate": 4.472222222222223e-06,
"loss": -0.0256,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.035491589456796646,
"mask/share_reasoning": 0.8255130052566528,
"mask/share_step_conf": 0.13899537920951843,
"num_tokens": 9240326.0,
"reward": 1.0818815231323242,
"reward_std": 0.14773604273796082,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.5742976665496826,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8617267608642578,
"step": 39
},
{
"adv/mean_abs_final_conf": 0.7261002063751221,
"adv/mean_abs_reasoning": 0.4546775817871094,
"adv/mean_abs_step_conf": 0.7494188547134399,
"adv/ratio_final_to_reasoning": 1.5969562508914263,
"adv/ratio_step_to_reasoning": 1.6482423693903063,
"adv/std_final_conf": 0.8997302651405334,
"adv/std_reasoning": 0.7204935550689697,
"adv/std_step_conf": 0.9343372583389282,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5853891698532132,
"calib/avg_num_step_conf": 5.7890625,
"calib/ece": 0.3858039215686273,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.6666666666666666,
"calib/gap": 0.08098556802763046,
"calib/mean_conf": 0.8352941176470589,
"calib/mu_c": 0.8778512396694216,
"calib/mu_w": 0.7968656716417911,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3732941176470587,
"calib/std_conf": 0.25733134923427275,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.37328571428571433,
"calib/step_q_c_n": 630.0,
"calib/step_q_gap": 0.04505801475519794,
"calib/step_q_w": 0.3282276995305164,
"calib/step_q_w_n": 852.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1949.0,
"completions/max_terminated_length": 1949.0,
"completions/mean_length": 508.890625,
"completions/mean_terminated_length": 508.890625,
"completions/min_length": 157.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.042666666666666665,
"grad_norm": 0.08238881826400757,
"kl": 0.08835983276367188,
"learning_rate": 4.444444444444444e-06,
"loss": -0.0775,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03551477566361427,
"mask/share_reasoning": 0.8364818692207336,
"mask/share_step_conf": 0.1280033439397812,
"num_tokens": 9477362.0,
"reward": 1.0873432159423828,
"reward_std": 0.21221011877059937,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.5923296809196472,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8590710759162903,
"step": 40
},
{
"adv/mean_abs_final_conf": 0.6341361999511719,
"adv/mean_abs_reasoning": 0.41668474674224854,
"adv/mean_abs_step_conf": 0.7611067891120911,
"adv/ratio_final_to_reasoning": 1.5218608430210518,
"adv/ratio_step_to_reasoning": 1.8265770347069938,
"adv/std_final_conf": 0.8549427390098572,
"adv/std_reasoning": 0.6816285848617554,
"adv/std_step_conf": 0.9348737597465515,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7514878347628218,
"calib/avg_num_step_conf": 5.80859375,
"calib/ece": 0.1418823529411765,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.5058823529411764,
"calib/gap": 0.25013127953789593,
"calib/mean_conf": 0.7325490196078431,
"calib/mu_c": 0.7894416243654822,
"calib/mu_w": 0.5393103448275862,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.050941176470588274,
"calib/std_conf": 0.30395976989517065,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.375041782729805,
"calib/step_q_c_n": 1077.0,
"calib/step_q_gap": 0.0862612949249269,
"calib/step_q_w": 0.2887804878048781,
"calib/step_q_w_n": 410.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2441.0,
"completions/max_terminated_length": 2441.0,
"completions/mean_length": 449.3359375,
"completions/mean_terminated_length": 449.3359375,
"completions/min_length": 126.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.04373333333333333,
"grad_norm": 0.116298146545887,
"kl": 0.1528167724609375,
"learning_rate": 4.416666666666667e-06,
"loss": -0.0102,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.039690546691417694,
"mask/share_reasoning": 0.8176549077033997,
"mask/share_step_conf": 0.14265450835227966,
"num_tokens": 9699640.0,
"reward": 1.2204053401947021,
"reward_std": 0.14665330946445465,
"rewards/accuracy_reward_step": 0.76953125,
"rewards/final_brier_reward_step": 0.8149999380111694,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8484572172164917,
"step": 41
},
{
"adv/mean_abs_final_conf": 0.7516371011734009,
"adv/mean_abs_reasoning": 0.3156457245349884,
"adv/mean_abs_step_conf": 0.7655600309371948,
"adv/ratio_final_to_reasoning": 2.3812681203926274,
"adv/ratio_step_to_reasoning": 2.425377476805756,
"adv/std_final_conf": 0.9281405806541443,
"adv/std_reasoning": 0.596068799495697,
"adv/std_step_conf": 0.9343998432159424,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6625789669267931,
"calib/avg_num_step_conf": 5.7109375,
"calib/ece": 0.16611764705882343,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.3333333333333333,
"calib/gap": 0.16887588257153463,
"calib/mean_conf": 0.6492549019607843,
"calib/mu_c": 0.7267391304347826,
"calib/mu_w": 0.5578632478632479,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13709803921568617,
"calib/std_conf": 0.30193206891729246,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.40450340136054425,
"calib/step_q_c_n": 735.0,
"calib/step_q_gap": 0.051050856656280175,
"calib/step_q_w": 0.35345254470426407,
"calib/step_q_w_n": 727.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3056.0,
"completions/max_terminated_length": 3056.0,
"completions/mean_length": 404.55078125,
"completions/mean_terminated_length": 404.55078125,
"completions/min_length": 161.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.0448,
"grad_norm": 0.06288475543260574,
"kl": 0.09320068359375,
"learning_rate": 4.388888888888889e-06,
"loss": 0.0136,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.04105367884039879,
"mask/share_reasoning": 0.8106446266174316,
"mask/share_step_conf": 0.14830169081687927,
"num_tokens": 9907573.0,
"reward": 1.1653730869293213,
"reward_std": 0.14880496263504028,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.7298547029495239,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.862573504447937,
"step": 42
},
{
"adv/mean_abs_final_conf": 0.772418737411499,
"adv/mean_abs_reasoning": 0.5696216821670532,
"adv/mean_abs_step_conf": 0.7576810121536255,
"adv/ratio_final_to_reasoning": 1.3560206038382006,
"adv/ratio_step_to_reasoning": 1.3301477732924851,
"adv/std_final_conf": 0.9291498064994812,
"adv/std_reasoning": 0.8097960352897644,
"adv/std_step_conf": 0.934786856174469,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6473269660133962,
"calib/avg_num_step_conf": 5.76953125,
"calib/ece": 0.12780392156862744,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.1450980392156863,
"calib/gap": 0.1366335896799803,
"calib/mean_conf": 0.5448235294117647,
"calib/mu_c": 0.6069784172661871,
"calib/mu_w": 0.47034482758620677,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06376470588235292,
"calib/std_conf": 0.2741615886685601,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42774018944519626,
"calib/step_q_c_n": 739.0,
"calib/step_q_gap": 0.05049086695197136,
"calib/step_q_w": 0.3772493224932249,
"calib/step_q_w_n": 738.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2090.0,
"completions/max_terminated_length": 2090.0,
"completions/mean_length": 462.4375,
"completions/mean_terminated_length": 464.2510070800781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 97.0,
"epoch": 0.04586666666666667,
"grad_norm": 0.03930652514100075,
"kl": 0.065765380859375,
"learning_rate": 4.361111111111112e-06,
"loss": -0.0474,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03815210610628128,
"mask/share_reasoning": 0.822625994682312,
"mask/share_step_conf": 0.13531562685966492,
"num_tokens": 10131181.0,
"reward": 1.1641141176223755,
"reward_std": 0.1371372938156128,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.741721510887146,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8524627685546875,
"step": 43
},
{
"adv/mean_abs_final_conf": 0.7673567533493042,
"adv/mean_abs_reasoning": 0.38849079608917236,
"adv/mean_abs_step_conf": 0.7680708169937134,
"adv/ratio_final_to_reasoning": 1.9752250531391449,
"adv/ratio_step_to_reasoning": 1.9770630983427828,
"adv/std_final_conf": 0.9358692765235901,
"adv/std_reasoning": 0.6814560294151306,
"adv/std_step_conf": 0.9348883032798767,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6887874676205747,
"calib/avg_num_step_conf": 6.54296875,
"calib/ece": 0.1507450980392156,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.17254901960784313,
"calib/gap": 0.18600037005057357,
"calib/mean_conf": 0.5256470588235295,
"calib/mu_c": 0.6233884297520661,
"calib/mu_w": 0.4373880597014925,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10094117647058819,
"calib/std_conf": 0.2883493811829746,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4323913043478261,
"calib/step_q_c_n": 736.0,
"calib/step_q_gap": 0.06701324257998792,
"calib/step_q_w": 0.36537806176783816,
"calib/step_q_w_n": 939.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1908.0,
"completions/max_terminated_length": 1908.0,
"completions/mean_length": 491.39453125,
"completions/mean_terminated_length": 493.32159423828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.046933333333333334,
"grad_norm": 0.03840534761548042,
"kl": 0.0656280517578125,
"learning_rate": 4.333333333333334e-06,
"loss": -0.0351,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03497948497533798,
"mask/share_reasoning": 0.8223901391029358,
"mask/share_step_conf": 0.13872411847114563,
"num_tokens": 10363298.0,
"reward": 1.1694731712341309,
"reward_std": 0.13373121619224548,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.7546882629394531,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8603387475013733,
"step": 44
},
{
"adv/mean_abs_final_conf": 0.7848135828971863,
"adv/mean_abs_reasoning": 0.5568833351135254,
"adv/mean_abs_step_conf": 0.7529138326644897,
"adv/ratio_final_to_reasoning": 1.4092962267172089,
"adv/ratio_step_to_reasoning": 1.352013582002776,
"adv/std_final_conf": 0.9361516237258911,
"adv/std_reasoning": 0.7927802205085754,
"adv/std_step_conf": 0.9352006316184998,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6241544094830261,
"calib/avg_num_step_conf": 5.9453125,
"calib/ece": 0.14712598425196852,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.031496062992125984,
"calib/gap": 0.11007075032582392,
"calib/mean_conf": 0.4331102362204724,
"calib/mu_c": 0.48641221374045807,
"calib/mu_w": 0.37634146341463415,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.032244094488188976,
"calib/std_conf": 0.26554685006653567,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.424720744680851,
"calib/step_q_c_n": 752.0,
"calib/step_q_gap": 0.033590874550980865,
"calib/step_q_w": 0.39112987012987016,
"calib/step_q_w_n": 770.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2426.0,
"completions/max_terminated_length": 2426.0,
"completions/mean_length": 445.77734375,
"completions/mean_terminated_length": 445.77734375,
"completions/min_length": 112.0,
"completions/min_terminated_length": 112.0,
"epoch": 0.048,
"grad_norm": 0.03386177122592926,
"kl": 0.07482147216796875,
"learning_rate": 4.305555555555556e-06,
"loss": -0.0692,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04040045291185379,
"mask/share_reasoning": 0.8123257756233215,
"mask/share_step_conf": 0.14727374911308289,
"num_tokens": 10582465.0,
"reward": 1.1441757678985596,
"reward_std": 0.16143080592155457,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.7184984683990479,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8465688824653625,
"step": 45
},
{
"adv/mean_abs_final_conf": 0.8008900880813599,
"adv/mean_abs_reasoning": 0.49210262298583984,
"adv/mean_abs_step_conf": 0.7658165097236633,
"adv/ratio_final_to_reasoning": 1.627485915888738,
"adv/ratio_step_to_reasoning": 1.5562130213349818,
"adv/std_final_conf": 0.9361664652824402,
"adv/std_reasoning": 0.7392691969871521,
"adv/std_step_conf": 0.934954047203064,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5794730392156863,
"calib/avg_num_step_conf": 6.7890625,
"calib/ece": 0.1973046875,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.125,
"calib/gap": 0.08427941176470594,
"calib/mean_conf": 0.4615234375,
"calib/mu_c": 0.5010294117647058,
"calib/mu_w": 0.4167499999999999,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0637890625,
"calib/std_conf": 0.2845768916535276,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4298466981132075,
"calib/step_q_c_n": 848.0,
"calib/step_q_gap": 0.06432647339410641,
"calib/step_q_w": 0.3655202247191011,
"calib/step_q_w_n": 890.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2292.0,
"completions/max_terminated_length": 2292.0,
"completions/mean_length": 494.08203125,
"completions/mean_terminated_length": 496.0196228027344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 149.0,
"epoch": 0.04906666666666667,
"grad_norm": 0.029355766251683235,
"kl": 0.07450103759765625,
"learning_rate": 4.277777777777778e-06,
"loss": -0.0534,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.039339788258075714,
"mask/share_reasoning": 0.8077389597892761,
"mask/share_step_conf": 0.14901497960090637,
"num_tokens": 10813718.0,
"reward": 1.141398310661316,
"reward_std": 0.1377015858888626,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.7071058750152588,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.8462938070297241,
"step": 46
},
{
"adv/mean_abs_final_conf": 0.7672794461250305,
"adv/mean_abs_reasoning": 0.3217868208885193,
"adv/mean_abs_step_conf": 0.7579846382141113,
"adv/ratio_final_to_reasoning": 2.38443402997803,
"adv/ratio_step_to_reasoning": 2.35554904368414,
"adv/std_final_conf": 0.9359407424926758,
"adv/std_reasoning": 0.6402183175086975,
"adv/std_step_conf": 0.9345706105232239,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7161493477282951,
"calib/avg_num_step_conf": 6.41015625,
"calib/ece": 0.11127999999999999,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.076,
"calib/gap": 0.19506586980271196,
"calib/mean_conf": 0.47103999999999996,
"calib/mu_c": 0.5623308270676692,
"calib/mu_w": 0.3672649572649573,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.02516,
"calib/std_conf": 0.2673434465252515,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4381193255512321,
"calib/step_q_c_n": 771.0,
"calib/step_q_gap": 0.060544612907553974,
"calib/step_q_w": 0.37757471264367815,
"calib/step_q_w_n": 870.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2668.0,
"completions/max_terminated_length": 2668.0,
"completions/mean_length": 503.60546875,
"completions/mean_terminated_length": 507.57086181640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.050133333333333335,
"grad_norm": 0.04785727709531784,
"kl": 0.082183837890625,
"learning_rate": 4.25e-06,
"loss": -0.0998,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03499086946249008,
"mask/share_reasoning": 0.8182421326637268,
"mask/share_step_conf": 0.1389545202255249,
"num_tokens": 11048617.0,
"reward": 1.1591176986694336,
"reward_std": 0.15524102747440338,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.754852294921875,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8427762985229492,
"step": 47
},
{
"adv/mean_abs_final_conf": 0.7918989658355713,
"adv/mean_abs_reasoning": 0.4984220564365387,
"adv/mean_abs_step_conf": 0.762143611907959,
"adv/ratio_final_to_reasoning": 1.5888120431452042,
"adv/ratio_step_to_reasoning": 1.5291129316324679,
"adv/std_final_conf": 0.9350493550300598,
"adv/std_reasoning": 0.7393231987953186,
"adv/std_step_conf": 0.9349328279495239,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6886497309473157,
"calib/avg_num_step_conf": 6.078125,
"calib/ece": 0.09735177865612649,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.08300395256916997,
"calib/gap": 0.1758290576899012,
"calib/mean_conf": 0.47158102766798415,
"calib/mu_c": 0.5626229508196722,
"calib/mu_w": 0.38679389312977097,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0433596837944664,
"calib/std_conf": 0.2618946442738743,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4343328335832084,
"calib/step_q_c_n": 667.0,
"calib/step_q_gap": 0.06582889657533436,
"calib/step_q_w": 0.36850393700787404,
"calib/step_q_w_n": 889.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2244.0,
"completions/max_terminated_length": 2244.0,
"completions/mean_length": 452.6953125,
"completions/mean_terminated_length": 454.4706115722656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 112.0,
"epoch": 0.0512,
"grad_norm": 0.030040910467505455,
"kl": 0.095123291015625,
"learning_rate": 4.222222222222223e-06,
"loss": -0.0465,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.039761632680892944,
"mask/share_reasoning": 0.809786319732666,
"mask/share_step_conf": 0.14654578268527985,
"num_tokens": 11268195.0,
"reward": 1.1682615280151367,
"reward_std": 0.13624170422554016,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.76040118932724,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8554352521896362,
"step": 48
},
{
"adv/mean_abs_final_conf": 0.7727680206298828,
"adv/mean_abs_reasoning": 0.4392518997192383,
"adv/mean_abs_step_conf": 0.7774852514266968,
"adv/ratio_final_to_reasoning": 1.759282136568611,
"adv/ratio_step_to_reasoning": 1.7700213748048694,
"adv/std_final_conf": 0.9354192018508911,
"adv/std_reasoning": 0.7013979554176331,
"adv/std_step_conf": 0.9352362155914307,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7392572428543651,
"calib/avg_num_step_conf": 6.4375,
"calib/ece": 0.10184,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.168,
"calib/gap": 0.23718063387128147,
"calib/mean_conf": 0.55944,
"calib/mu_c": 0.6647482014388489,
"calib/mu_w": 0.4275675675675675,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05264000000000002,
"calib/std_conf": 0.28427818488234374,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4316390588235294,
"calib/step_q_c_n": 850.0,
"calib/step_q_gap": 0.07549419666814083,
"calib/step_q_w": 0.35614486215538854,
"calib/step_q_w_n": 798.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2992.0,
"completions/max_terminated_length": 2992.0,
"completions/mean_length": 503.390625,
"completions/mean_terminated_length": 503.390625,
"completions/min_length": 152.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.05226666666666667,
"grad_norm": 0.03872883692383766,
"kl": 0.0874481201171875,
"learning_rate": 4.194444444444445e-06,
"loss": -0.0188,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03684543818235397,
"mask/share_reasoning": 0.8130807876586914,
"mask/share_step_conf": 0.15007378160953522,
"num_tokens": 11501599.0,
"reward": 1.1672701835632324,
"reward_std": 0.16787287592887878,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.770910918712616,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8398154377937317,
"step": 49
},
{
"adv/mean_abs_final_conf": 0.7783648371696472,
"adv/mean_abs_reasoning": 0.4970937967300415,
"adv/mean_abs_step_conf": 0.7481029629707336,
"adv/ratio_final_to_reasoning": 1.5658309202203877,
"adv/ratio_step_to_reasoning": 1.504953326498678,
"adv/std_final_conf": 0.9354126453399658,
"adv/std_reasoning": 0.7394469380378723,
"adv/std_step_conf": 0.9351540207862854,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.737333859718384,
"calib/avg_num_step_conf": 6.91796875,
"calib/ece": 0.10685258964143424,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.199203187250996,
"calib/gap": 0.2498032635873142,
"calib/mean_conf": 0.5972111553784861,
"calib/mu_c": 0.6987248322147651,
"calib/mu_w": 0.4489215686274509,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.05521912350597606,
"calib/std_conf": 0.29781306343418107,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4361684460260973,
"calib/step_q_c_n": 843.0,
"calib/step_q_gap": 0.11400249775023519,
"calib/step_q_w": 0.3221659482758621,
"calib/step_q_w_n": 928.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2590.0,
"completions/max_terminated_length": 2590.0,
"completions/mean_length": 497.34765625,
"completions/mean_terminated_length": 501.2637634277344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.05333333333333334,
"grad_norm": 0.05003435164690018,
"kl": 0.08087158203125,
"learning_rate": 4.166666666666667e-06,
"loss": -0.0706,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.036934275180101395,
"mask/share_reasoning": 0.8138121962547302,
"mask/share_step_conf": 0.14144101738929749,
"num_tokens": 11734280.0,
"reward": 1.1801552772521973,
"reward_std": 0.19399115443229675,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7727367281913757,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8505699038505554,
"step": 50
},
{
"adv/mean_abs_final_conf": 0.7511352300643921,
"adv/mean_abs_reasoning": 0.36320850253105164,
"adv/mean_abs_step_conf": 0.7650404572486877,
"adv/ratio_final_to_reasoning": 2.06805519372492,
"adv/ratio_step_to_reasoning": 2.1063396146219966,
"adv/std_final_conf": 0.9327057003974915,
"adv/std_reasoning": 0.6813850402832031,
"adv/std_step_conf": 0.9352177977561951,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.8028094105869102,
"calib/avg_num_step_conf": 6.1328125,
"calib/ece": 0.15687747035573124,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.2766798418972332,
"calib/gap": 0.298421349017645,
"calib/mean_conf": 0.6408300395256917,
"calib/mu_c": 0.784732824427481,
"calib/mu_w": 0.48631147540983605,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13996047430830044,
"calib/std_conf": 0.30277586442427024,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.38387305699481866,
"calib/step_q_c_n": 772.0,
"calib/step_q_gap": 0.05847205448855297,
"calib/step_q_w": 0.3254010025062657,
"calib/step_q_w_n": 798.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2652.0,
"completions/max_terminated_length": 2652.0,
"completions/mean_length": 476.37109375,
"completions/mean_terminated_length": 478.2392578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.0544,
"grad_norm": 0.03629736974835396,
"kl": 0.08521270751953125,
"learning_rate": 4.138888888888889e-06,
"loss": -0.0354,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03722125664353371,
"mask/share_reasoning": 0.8207862377166748,
"mask/share_step_conf": 0.1380862295627594,
"num_tokens": 11965527.0,
"reward": 1.1790621280670166,
"reward_std": 0.16613981127738953,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.7832379341125488,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8499242067337036,
"step": 51
},
{
"adv/mean_abs_final_conf": 0.7443599104881287,
"adv/mean_abs_reasoning": 0.4907376766204834,
"adv/mean_abs_step_conf": 0.7508261203765869,
"adv/ratio_final_to_reasoning": 1.5168183450152868,
"adv/ratio_step_to_reasoning": 1.5299948549849074,
"adv/std_final_conf": 0.9310631155967712,
"adv/std_reasoning": 0.7392945289611816,
"adv/std_step_conf": 0.9352546334266663,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7442952527698291,
"calib/avg_num_step_conf": 5.84375,
"calib/ece": 0.1342913385826772,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.452755905511811,
"calib/gap": 0.29940494533714884,
"calib/mean_conf": 0.7053937007874016,
"calib/mu_c": 0.7961581920903956,
"calib/mu_w": 0.4967532467532468,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.0714173228346457,
"calib/std_conf": 0.32899818484242627,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.3782980866062437,
"calib/step_q_c_n": 993.0,
"calib/step_q_gap": 0.07326826553268506,
"calib/step_q_w": 0.30502982107355864,
"calib/step_q_w_n": 503.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2045.0,
"completions/max_terminated_length": 2045.0,
"completions/mean_length": 450.37109375,
"completions/mean_terminated_length": 450.37109375,
"completions/min_length": 148.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.055466666666666664,
"grad_norm": 0.040435388684272766,
"kl": 0.090972900390625,
"learning_rate": 4.111111111111111e-06,
"loss": -0.0395,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.037771232426166534,
"mask/share_reasoning": 0.821386992931366,
"mask/share_step_conf": 0.1408417671918869,
"num_tokens": 12188774.0,
"reward": 1.20395827293396,
"reward_std": 0.17617203295230865,
"rewards/accuracy_reward_step": 0.69140625,
"rewards/final_brier_reward_step": 0.8006316423416138,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.847044050693512,
"step": 52
},
{
"adv/mean_abs_final_conf": 0.7123820781707764,
"adv/mean_abs_reasoning": 0.3875897526741028,
"adv/mean_abs_step_conf": 0.7608627080917358,
"adv/ratio_final_to_reasoning": 1.8379796505346953,
"adv/ratio_step_to_reasoning": 1.9630619820114086,
"adv/std_final_conf": 0.8869374394416809,
"adv/std_reasoning": 0.6815102100372314,
"adv/std_step_conf": 0.934977114200592,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7330726621049202,
"calib/avg_num_step_conf": 6.8515625,
"calib/ece": 0.22224409448818894,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6377952755905512,
"calib/gap": 0.24778820462691442,
"calib/mean_conf": 0.8262598425196851,
"calib/mu_c": 0.9228387096774194,
"calib/mu_w": 0.675050505050505,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.21913385826771647,
"calib/std_conf": 0.2570283452795441,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4098411122144985,
"calib/step_q_c_n": 1007.0,
"calib/step_q_gap": 0.1232012193095453,
"calib/step_q_w": 0.2866398929049532,
"calib/step_q_w_n": 747.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2443.0,
"completions/max_terminated_length": 2443.0,
"completions/mean_length": 501.37109375,
"completions/mean_terminated_length": 501.37109375,
"completions/min_length": 141.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.05653333333333333,
"grad_norm": 0.032593466341495514,
"kl": 0.0872955322265625,
"learning_rate": 4.083333333333334e-06,
"loss": -0.0127,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03349916636943817,
"mask/share_reasoning": 0.8224176168441772,
"mask/share_step_conf": 0.1440831869840622,
"num_tokens": 12422949.0,
"reward": 1.1682567596435547,
"reward_std": 0.18513035774230957,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.7612996101379395,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8376426100730896,
"step": 53
},
{
"adv/mean_abs_final_conf": 0.6633133888244629,
"adv/mean_abs_reasoning": 0.4657354950904846,
"adv/mean_abs_step_conf": 0.7782049179077148,
"adv/ratio_final_to_reasoning": 1.4242276910751503,
"adv/ratio_step_to_reasoning": 1.670916058816867,
"adv/std_final_conf": 0.8949787616729736,
"adv/std_reasoning": 0.7574234008789062,
"adv/std_step_conf": 0.9352843761444092,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.697829131652661,
"calib/avg_num_step_conf": 5.8203125,
"calib/ece": 0.20271653543307086,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.7165354330708661,
"calib/gap": 0.2250952380952379,
"calib/mean_conf": 0.8525590551181103,
"calib/mu_c": 0.9269999999999998,
"calib/mu_w": 0.7019047619047619,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19299212598425197,
"calib/std_conf": 0.2707356310971349,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4095027027027027,
"calib/step_q_c_n": 925.0,
"calib/step_q_gap": 0.08858234872040183,
"calib/step_q_w": 0.3209203539823009,
"calib/step_q_w_n": 565.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1822.0,
"completions/max_terminated_length": 1822.0,
"completions/mean_length": 422.12890625,
"completions/mean_terminated_length": 423.7843322753906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 95.0,
"epoch": 0.0576,
"grad_norm": 0.03692952170968056,
"kl": 0.1038055419921875,
"learning_rate": 4.055555555555556e-06,
"loss": -0.0597,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03953488916158676,
"mask/share_reasoning": 0.8116753101348877,
"mask/share_step_conf": 0.14488358795642853,
"num_tokens": 12637246.0,
"reward": 1.1712415218353271,
"reward_std": 0.20544785261154175,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.7653933763504028,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8305597901344299,
"step": 54
},
{
"adv/mean_abs_final_conf": 0.754909098148346,
"adv/mean_abs_reasoning": 0.5777817964553833,
"adv/mean_abs_step_conf": 0.7564443349838257,
"adv/ratio_final_to_reasoning": 1.306564351420581,
"adv/ratio_step_to_reasoning": 1.3092214736160155,
"adv/std_final_conf": 0.9008998274803162,
"adv/std_reasoning": 0.7928237318992615,
"adv/std_step_conf": 0.9352696537971497,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7237327755905512,
"calib/avg_num_step_conf": 6.09375,
"calib/ece": 0.3145882352941177,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.6313725490196078,
"calib/gap": 0.23964751476377943,
"calib/mean_conf": 0.7945882352941177,
"calib/mu_c": 0.9148818897637795,
"calib/mu_w": 0.675234375,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.30556862745098046,
"calib/std_conf": 0.3087300474759994,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.3964864864864865,
"calib/step_q_c_n": 703.0,
"calib/step_q_gap": 0.08968368601974203,
"calib/step_q_w": 0.30680280046674446,
"calib/step_q_w_n": 857.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2059.0,
"completions/max_terminated_length": 2059.0,
"completions/mean_length": 473.15234375,
"completions/mean_terminated_length": 475.00787353515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.058666666666666666,
"grad_norm": 0.03538893908262253,
"kl": 0.0989227294921875,
"learning_rate": 4.027777777777779e-06,
"loss": -0.0235,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.036335572600364685,
"mask/share_reasoning": 0.8202430009841919,
"mask/share_step_conf": 0.13951516151428223,
"num_tokens": 12866197.0,
"reward": 1.1363887786865234,
"reward_std": 0.23207849264144897,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.6799824237823486,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8634260296821594,
"step": 55
},
{
"adv/mean_abs_final_conf": 0.6986392140388489,
"adv/mean_abs_reasoning": 0.43160390853881836,
"adv/mean_abs_step_conf": 0.735781192779541,
"adv/ratio_final_to_reasoning": 1.6187045580844488,
"adv/ratio_step_to_reasoning": 1.7047602633407686,
"adv/std_final_conf": 0.8800408244132996,
"adv/std_reasoning": 0.7013711333274841,
"adv/std_step_conf": 0.9352052807807922,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7471084040851482,
"calib/avg_num_step_conf": 6.62890625,
"calib/ece": 0.3129803921568628,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.6431372549019608,
"calib/gap": 0.24679032853451455,
"calib/mean_conf": 0.7961960784313724,
"calib/mu_c": 0.9181395348837209,
"calib/mu_w": 0.6713492063492064,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.30164705882352943,
"calib/std_conf": 0.3198603261111253,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3863864491844417,
"calib/step_q_c_n": 797.0,
"calib/step_q_gap": 0.09199756029555278,
"calib/step_q_w": 0.2943888888888889,
"calib/step_q_w_n": 900.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2321.0,
"completions/max_terminated_length": 2321.0,
"completions/mean_length": 500.06640625,
"completions/mean_terminated_length": 502.0274658203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.05973333333333333,
"grad_norm": 0.038583461195230484,
"kl": 0.0832672119140625,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0199,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.035010527819395065,
"mask/share_reasoning": 0.8196976184844971,
"mask/share_step_conf": 0.14138558506965637,
"num_tokens": 13101054.0,
"reward": 1.1343872547149658,
"reward_std": 0.18083718419075012,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.6805593967437744,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8593308329582214,
"step": 56
},
{
"adv/mean_abs_final_conf": 0.646998941898346,
"adv/mean_abs_reasoning": 0.4985702335834503,
"adv/mean_abs_step_conf": 0.7729321718215942,
"adv/ratio_final_to_reasoning": 1.2977087245022054,
"adv/ratio_step_to_reasoning": 1.5502974701601824,
"adv/std_final_conf": 0.8325883150100708,
"adv/std_reasoning": 0.7393158674240112,
"adv/std_step_conf": 0.9350900650024414,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7011326860841424,
"calib/avg_num_step_conf": 6.5703125,
"calib/ece": 0.2856521739130435,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.7549407114624506,
"calib/gap": 0.17861229773462772,
"calib/mean_conf": 0.8607509881422926,
"calib/mu_c": 0.9334666666666666,
"calib/mu_w": 0.7548543689320388,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2767588932806324,
"calib/std_conf": 0.26906922946717476,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.38264637002341917,
"calib/step_q_c_n": 854.0,
"calib/step_q_gap": 0.09376955842921625,
"calib/step_q_w": 0.2888768115942029,
"calib/step_q_w_n": 828.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2945.0,
"completions/max_terminated_length": 2945.0,
"completions/mean_length": 489.18359375,
"completions/mean_terminated_length": 491.10198974609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.0608,
"grad_norm": 0.03516416251659393,
"kl": 0.0896453857421875,
"learning_rate": 3.972222222222223e-06,
"loss": 0.0234,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03536321222782135,
"mask/share_reasoning": 0.8221592903137207,
"mask/share_step_conf": 0.13857123255729675,
"num_tokens": 13333077.0,
"reward": 1.1429297924041748,
"reward_std": 0.19141796231269836,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6924902200698853,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8523504734039307,
"step": 57
},
{
"adv/mean_abs_final_conf": 0.7294912338256836,
"adv/mean_abs_reasoning": 0.6277225613594055,
"adv/mean_abs_step_conf": 0.7514354586601257,
"adv/ratio_final_to_reasoning": 1.162123649412706,
"adv/ratio_step_to_reasoning": 1.1970821265891824,
"adv/std_final_conf": 0.9116400480270386,
"adv/std_reasoning": 0.843001663684845,
"adv/std_step_conf": 0.9354155659675598,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6327699891172142,
"calib/avg_num_step_conf": 7.890625,
"calib/ece": 0.32192,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.54,
"calib/gap": 0.14120478842583717,
"calib/mean_conf": 0.76152,
"calib/mu_c": 0.8332520325203252,
"calib/mu_w": 0.692047244094488,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.29572,
"calib/std_conf": 0.3228474711067132,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.34553341148886285,
"calib/step_q_c_n": 853.0,
"calib/step_q_gap": 0.07273992391388429,
"calib/step_q_w": 0.27279348757497857,
"calib/step_q_w_n": 1167.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2570.0,
"completions/max_terminated_length": 2570.0,
"completions/mean_length": 583.38671875,
"completions/mean_terminated_length": 585.674560546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.06186666666666667,
"grad_norm": 0.027154145762324333,
"kl": 0.10162353515625,
"learning_rate": 3.944444444444445e-06,
"loss": 0.0132,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.031217699870467186,
"mask/share_reasoning": 0.8236774206161499,
"mask/share_step_conf": 0.14119866490364075,
"num_tokens": 13588744.0,
"reward": 1.077712059020996,
"reward_std": 0.24210575222969055,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.6262355446815491,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8257089257240295,
"step": 58
},
{
"adv/mean_abs_final_conf": 0.5891823172569275,
"adv/mean_abs_reasoning": 0.47933292388916016,
"adv/mean_abs_step_conf": 0.7589001655578613,
"adv/ratio_final_to_reasoning": 1.2291713919346143,
"adv/ratio_step_to_reasoning": 1.5832423097507644,
"adv/std_final_conf": 0.8096221685409546,
"adv/std_reasoning": 0.7207550406455994,
"adv/std_step_conf": 0.9351330399513245,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6594375123786889,
"calib/avg_num_step_conf": 6.453125,
"calib/ece": 0.27825396825396814,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6626984126984127,
"calib/gap": 0.15903743315508012,
"calib/mean_conf": 0.8048412698412698,
"calib/mu_c": 0.8673202614379084,
"calib/mu_w": 0.7082828282828283,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.23797619047619037,
"calib/std_conf": 0.3100940311229322,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3749768844221105,
"calib/step_q_c_n": 995.0,
"calib/step_q_gap": 0.04001493617249108,
"calib/step_q_w": 0.33496194824961945,
"calib/step_q_w_n": 657.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2988.0,
"completions/max_terminated_length": 2988.0,
"completions/mean_length": 513.859375,
"completions/mean_terminated_length": 517.905517578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.06293333333333333,
"grad_norm": 0.03092842549085617,
"kl": 0.10501861572265625,
"learning_rate": 3.916666666666667e-06,
"loss": -0.0668,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03509576991200447,
"mask/share_reasoning": 0.8163177371025085,
"mask/share_step_conf": 0.1407739818096161,
"num_tokens": 13826540.0,
"reward": 1.132265567779541,
"reward_std": 0.19192655384540558,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6911335587501526,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8379942774772644,
"step": 59
},
{
"adv/mean_abs_final_conf": 0.6356889009475708,
"adv/mean_abs_reasoning": 0.5132553577423096,
"adv/mean_abs_step_conf": 0.7455482482910156,
"adv/ratio_final_to_reasoning": 1.238543137170195,
"adv/ratio_step_to_reasoning": 1.452587366200147,
"adv/std_final_conf": 0.876144528388977,
"adv/std_reasoning": 0.7927860021591187,
"adv/std_step_conf": 0.9351310729980469,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7157082479508197,
"calib/avg_num_step_conf": 6.9609375,
"calib/ece": 0.29088,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.644,
"calib/gap": 0.2622540983606557,
"calib/mean_conf": 0.77952,
"calib/mu_c": 0.9075,
"calib/mu_w": 0.6452459016393443,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2792,
"calib/std_conf": 0.3344616115490685,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.39738062755798087,
"calib/step_q_c_n": 733.0,
"calib/step_q_gap": 0.11975431678581688,
"calib/step_q_w": 0.277626310772164,
"calib/step_q_w_n": 1049.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2950.0,
"completions/max_terminated_length": 2950.0,
"completions/mean_length": 497.171875,
"completions/mean_terminated_length": 501.08660888671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.064,
"grad_norm": 0.0763402134180069,
"kl": 0.0853118896484375,
"learning_rate": 3.88888888888889e-06,
"loss": 0.0266,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03505894914269447,
"mask/share_reasoning": 0.8122283816337585,
"mask/share_step_conf": 0.1449001580476761,
"num_tokens": 14062672.0,
"reward": 1.1305547952651978,
"reward_std": 0.24602213501930237,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.6814101338386536,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.856257975101471,
"step": 60
},
{
"adv/mean_abs_final_conf": 0.6465821266174316,
"adv/mean_abs_reasoning": 0.5136411190032959,
"adv/mean_abs_step_conf": 0.7517024278640747,
"adv/ratio_final_to_reasoning": 1.258820804440469,
"adv/ratio_step_to_reasoning": 1.463477903254181,
"adv/std_final_conf": 0.8604548573493958,
"adv/std_reasoning": 0.7753183841705322,
"adv/std_step_conf": 0.9348891973495483,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6626819126819127,
"calib/avg_num_step_conf": 6.1796875,
"calib/ece": 0.26952380952380944,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6706349206349206,
"calib/gap": 0.1926559251559249,
"calib/mean_conf": 0.821031746031746,
"calib/mu_c": 0.9005405405405403,
"calib/mu_w": 0.7078846153846154,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.25162698412698403,
"calib/std_conf": 0.29678657118114243,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4120119760479042,
"calib/step_q_c_n": 835.0,
"calib/step_q_gap": 0.11402000817641822,
"calib/step_q_w": 0.29799196787148596,
"calib/step_q_w_n": 747.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2402.0,
"completions/max_terminated_length": 2402.0,
"completions/mean_length": 431.1953125,
"completions/mean_terminated_length": 431.1953125,
"completions/min_length": 125.0,
"completions/min_terminated_length": 125.0,
"epoch": 0.06506666666666666,
"grad_norm": 0.03931074216961861,
"kl": 0.09100341796875,
"learning_rate": 3.861111111111112e-06,
"loss": 0.0669,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.04118172079324722,
"mask/share_reasoning": 0.8074016571044922,
"mask/share_step_conf": 0.1514166295528412,
"num_tokens": 14277122.0,
"reward": 1.138131856918335,
"reward_std": 0.20521759986877441,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.6933664083480835,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8474522829055786,
"step": 61
},
{
"adv/mean_abs_final_conf": 0.7413532137870789,
"adv/mean_abs_reasoning": 0.6082359552383423,
"adv/mean_abs_step_conf": 0.7599615454673767,
"adv/ratio_final_to_reasoning": 1.218857924136651,
"adv/ratio_step_to_reasoning": 1.2494518597960549,
"adv/std_final_conf": 0.8915730118751526,
"adv/std_reasoning": 0.7930753827095032,
"adv/std_step_conf": 0.9355272054672241,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6220703124999999,
"calib/avg_num_step_conf": 7.06640625,
"calib/ece": 0.24818548387096775,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.41935483870967744,
"calib/gap": 0.17193229166666668,
"calib/mean_conf": 0.6165725806451613,
"calib/mu_c": 0.699765625,
"calib/mu_w": 0.5278333333333333,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1743145161290323,
"calib/std_conf": 0.37285184018863304,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3638804220398593,
"calib/step_q_c_n": 853.0,
"calib/step_q_gap": 0.05817958521977562,
"calib/step_q_w": 0.3057008368200837,
"calib/step_q_w_n": 956.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2701.0,
"completions/max_terminated_length": 2701.0,
"completions/mean_length": 519.65234375,
"completions/mean_terminated_length": 523.7440795898438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.06613333333333334,
"grad_norm": 0.041193604469299316,
"kl": 0.19834136962890625,
"learning_rate": 3.833333333333334e-06,
"loss": 0.0184,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.033691082149744034,
"mask/share_reasoning": 0.8175060749053955,
"mask/share_step_conf": 0.14099037647247314,
"num_tokens": 14517233.0,
"reward": 1.0956416130065918,
"reward_std": 0.2342512458562851,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.6636785268783569,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8230906128883362,
"step": 62
},
{
"adv/mean_abs_final_conf": 0.7436953186988831,
"adv/mean_abs_reasoning": 0.5573487281799316,
"adv/mean_abs_step_conf": 0.7506574392318726,
"adv/ratio_final_to_reasoning": 1.334344694976665,
"adv/ratio_step_to_reasoning": 1.3468361929939385,
"adv/std_final_conf": 0.9340667128562927,
"adv/std_reasoning": 0.775478184223175,
"adv/std_step_conf": 0.935070276260376,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.8093303713106952,
"calib/avg_num_step_conf": 6.62109375,
"calib/ece": 0.10964285714285713,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.2857142857142857,
"calib/gap": 0.39582164392256414,
"calib/mean_conf": 0.5507539682539684,
"calib/mu_c": 0.7313868613138685,
"calib/mu_w": 0.3355652173913044,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05837301587301586,
"calib/std_conf": 0.3528925639491461,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.39803337306317044,
"calib/step_q_c_n": 839.0,
"calib/step_q_gap": 0.10076701792298354,
"calib/step_q_w": 0.2972663551401869,
"calib/step_q_w_n": 856.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2231.0,
"completions/max_terminated_length": 2231.0,
"completions/mean_length": 544.3828125,
"completions/mean_terminated_length": 548.6693115234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.0672,
"grad_norm": 0.07931511104106903,
"kl": 0.0871429443359375,
"learning_rate": 3.8055555555555556e-06,
"loss": -0.0398,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03265227749943733,
"mask/share_reasoning": 0.8248114585876465,
"mask/share_step_conf": 0.1347237527370453,
"num_tokens": 14765235.0,
"reward": 1.2072186470031738,
"reward_std": 0.1769953966140747,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.8108534812927246,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8664516806602478,
"step": 63
},
{
"adv/mean_abs_final_conf": 0.7594350576400757,
"adv/mean_abs_reasoning": 0.6033494472503662,
"adv/mean_abs_step_conf": 0.7881118655204773,
"adv/ratio_final_to_reasoning": 1.2586985222262748,
"adv/ratio_step_to_reasoning": 1.3062278736014852,
"adv/std_final_conf": 0.9346402883529663,
"adv/std_reasoning": 0.8266748189926147,
"adv/std_step_conf": 0.9354366660118103,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6618489037843877,
"calib/avg_num_step_conf": 6.3359375,
"calib/ece": 0.21761133603238866,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.2591093117408907,
"calib/gap": 0.19151096215612334,
"calib/mean_conf": 0.5554251012145749,
"calib/mu_c": 0.6275324675324675,
"calib/mu_w": 0.43602150537634415,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07477732793522268,
"calib/std_conf": 0.34017339210741376,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.38183770883054896,
"calib/step_q_c_n": 838.0,
"calib/step_q_gap": 0.0992611782183041,
"calib/step_q_w": 0.28257653061224486,
"calib/step_q_w_n": 784.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2924.0,
"completions/max_terminated_length": 2924.0,
"completions/mean_length": 498.33203125,
"completions/mean_terminated_length": 502.2558898925781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.06826666666666667,
"grad_norm": 0.051069580018520355,
"kl": 0.0894927978515625,
"learning_rate": 3.777777777777778e-06,
"loss": 0.0888,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03600117191672325,
"mask/share_reasoning": 0.822626531124115,
"mask/share_step_conf": 0.13355976343154907,
"num_tokens": 14996584.0,
"reward": 1.1186976432800293,
"reward_std": 0.21334236860275269,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.7089800834655762,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8100893497467041,
"step": 64
},
{
"adv/mean_abs_final_conf": 0.7145624160766602,
"adv/mean_abs_reasoning": 0.4103482961654663,
"adv/mean_abs_step_conf": 0.7509387731552124,
"adv/ratio_final_to_reasoning": 1.7413558743973057,
"adv/ratio_step_to_reasoning": 1.8300033902234323,
"adv/std_final_conf": 0.8932875394821167,
"adv/std_reasoning": 0.6815477609634399,
"adv/std_step_conf": 0.9347946643829346,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.8041538461538461,
"calib/avg_num_step_conf": 5.64453125,
"calib/ece": 0.18450980392156868,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.36470588235294116,
"calib/gap": 0.3085692307692308,
"calib/mean_conf": 0.6925098039215688,
"calib/mu_c": 0.8437692307692307,
"calib/mu_w": 0.5351999999999999,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18360784313725498,
"calib/std_conf": 0.3136064332916695,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4357640750670241,
"calib/step_q_c_n": 746.0,
"calib/step_q_gap": 0.05981271598261778,
"calib/step_q_w": 0.3759513590844063,
"calib/step_q_w_n": 699.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2419.0,
"completions/max_terminated_length": 2419.0,
"completions/mean_length": 401.5703125,
"completions/mean_terminated_length": 401.5703125,
"completions/min_length": 142.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.06933333333333333,
"grad_norm": 0.03955350071191788,
"kl": 0.090484619140625,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.0301,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.041694652289152145,
"mask/share_reasoning": 0.8116481900215149,
"mask/share_step_conf": 0.14665712416172028,
"num_tokens": 15204410.0,
"reward": 1.1743881702423096,
"reward_std": 0.1349031776189804,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.7695730924606323,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8522812128067017,
"step": 65
},
{
"adv/mean_abs_final_conf": 0.7148923277854919,
"adv/mean_abs_reasoning": 0.46863657236099243,
"adv/mean_abs_step_conf": 0.7454225420951843,
"adv/ratio_final_to_reasoning": 1.525472764927121,
"adv/ratio_step_to_reasoning": 1.590619652964222,
"adv/std_final_conf": 0.9109669327735901,
"adv/std_reasoning": 0.7575206756591797,
"adv/std_step_conf": 0.9346955418586731,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7966229508196722,
"calib/avg_num_step_conf": 7.65234375,
"calib/ece": 0.08914979757085015,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.242914979757085,
"calib/gap": 0.36650950819672135,
"calib/mean_conf": 0.534251012145749,
"calib/mu_c": 0.71528,
"calib/mu_w": 0.3487704918032787,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05866396761133601,
"calib/std_conf": 0.33597544515327177,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.40850253807106596,
"calib/step_q_c_n": 788.0,
"calib/step_q_gap": 0.1352019402913905,
"calib/step_q_w": 0.27330059777967547,
"calib/step_q_w_n": 1171.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2772.0,
"completions/max_terminated_length": 2772.0,
"completions/mean_length": 598.10546875,
"completions/mean_terminated_length": 598.10546875,
"completions/min_length": 145.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.0704,
"grad_norm": 0.03529307246208191,
"kl": 0.0828857421875,
"learning_rate": 3.7222222222222225e-06,
"loss": 0.1242,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.032410524785518646,
"mask/share_reasoning": 0.8293423652648926,
"mask/share_step_conf": 0.13824716210365295,
"num_tokens": 15463877.0,
"reward": 1.1834068298339844,
"reward_std": 0.1921721249818802,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.790777325630188,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8569409847259521,
"step": 66
},
{
"adv/mean_abs_final_conf": 0.7141172885894775,
"adv/mean_abs_reasoning": 0.38203346729278564,
"adv/mean_abs_step_conf": 0.7260956764221191,
"adv/ratio_final_to_reasoning": 1.869253219226961,
"adv/ratio_step_to_reasoning": 1.9006075084663945,
"adv/std_final_conf": 0.9237503409385681,
"adv/std_reasoning": 0.6815032958984375,
"adv/std_step_conf": 0.9345933794975281,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7906249999999999,
"calib/avg_num_step_conf": 6.69921875,
"calib/ece": 0.09826086956521743,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.4031620553359684,
"calib/gap": 0.36772513440860205,
"calib/mean_conf": 0.6520158102766799,
"calib/mu_c": 0.7871874999999999,
"calib/mu_w": 0.4194623655913979,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.058932806324110736,
"calib/std_conf": 0.3431432935497034,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4334216101694915,
"calib/step_q_c_n": 944.0,
"calib/step_q_gap": 0.1032270576403086,
"calib/step_q_w": 0.3301945525291829,
"calib/step_q_w_n": 771.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2119.0,
"completions/max_terminated_length": 2119.0,
"completions/mean_length": 489.15625,
"completions/mean_terminated_length": 491.07452392578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.07146666666666666,
"grad_norm": 0.061680082231760025,
"kl": 0.088104248046875,
"learning_rate": 3.694444444444445e-06,
"loss": -0.0804,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03602088615298271,
"mask/share_reasoning": 0.8216089010238647,
"mask/share_step_conf": 0.13846397399902344,
"num_tokens": 15694109.0,
"reward": 1.2258788347244263,
"reward_std": 0.1442015916109085,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.8107554316520691,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8788971900939941,
"step": 67
},
{
"adv/mean_abs_final_conf": 0.6820115447044373,
"adv/mean_abs_reasoning": 0.46427327394485474,
"adv/mean_abs_step_conf": 0.7704893350601196,
"adv/ratio_final_to_reasoning": 1.4689873033386043,
"adv/ratio_step_to_reasoning": 1.6595599581113007,
"adv/std_final_conf": 0.8864242434501648,
"adv/std_reasoning": 0.7391756772994995,
"adv/std_step_conf": 0.9349017143249512,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7717391304347826,
"calib/avg_num_step_conf": 6.6484375,
"calib/ece": 0.1602777777777778,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.43253968253968256,
"calib/gap": 0.3402326468344775,
"calib/mean_conf": 0.7078968253968254,
"calib/mu_c": 0.8618115942028985,
"calib/mu_w": 0.521578947368421,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.1602777777777778,
"calib/std_conf": 0.32516531398427684,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5014229765013054,
"calib/step_q_c_n": 766.0,
"calib/step_q_gap": 0.1717328055611344,
"calib/step_q_w": 0.329690170940171,
"calib/step_q_w_n": 936.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2515.0,
"completions/max_terminated_length": 2515.0,
"completions/mean_length": 493.16015625,
"completions/mean_terminated_length": 493.16015625,
"completions/min_length": 162.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.07253333333333334,
"grad_norm": 0.04005345329642296,
"kl": 0.0855255126953125,
"learning_rate": 3.6666666666666666e-06,
"loss": -0.0084,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.036960065364837646,
"mask/share_reasoning": 0.8169299364089966,
"mask/share_step_conf": 0.14610998332500458,
"num_tokens": 15924446.0,
"reward": 1.1919140815734863,
"reward_std": 0.18502911925315857,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.7743504047393799,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8709017038345337,
"step": 68
},
{
"adv/mean_abs_final_conf": 0.7154502868652344,
"adv/mean_abs_reasoning": 0.46667271852493286,
"adv/mean_abs_step_conf": 0.7458276748657227,
"adv/ratio_final_to_reasoning": 1.5330878760743545,
"adv/ratio_step_to_reasoning": 1.598181434781848,
"adv/std_final_conf": 0.9064786434173584,
"adv/std_reasoning": 0.7393455505371094,
"adv/std_step_conf": 0.9349368810653687,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7176014854654885,
"calib/avg_num_step_conf": 6.6953125,
"calib/ece": 0.22788844621513943,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.36254980079681276,
"calib/gap": 0.26841208861569976,
"calib/mean_conf": 0.6157768924302789,
"calib/mu_c": 0.7622807017543859,
"calib/mu_w": 0.49386861313868613,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1947410358565737,
"calib/std_conf": 0.3426026673469013,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4538686131386861,
"calib/step_q_c_n": 685.0,
"calib/step_q_gap": 0.09047697076745187,
"calib/step_q_w": 0.3633916423712342,
"calib/step_q_w_n": 1029.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2790.0,
"completions/max_terminated_length": 2790.0,
"completions/mean_length": 552.44140625,
"completions/mean_terminated_length": 554.6078491210938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.0736,
"grad_norm": 0.06692394614219666,
"kl": 0.24575042724609375,
"learning_rate": 3.638888888888889e-06,
"loss": 0.0174,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.032987501472234726,
"mask/share_reasoning": 0.831560492515564,
"mask/share_step_conf": 0.131545752286911,
"num_tokens": 16170367.0,
"reward": 1.145660638809204,
"reward_std": 0.21713489294052124,
"rewards/accuracy_reward_step": 0.4453125,
"rewards/final_brier_reward_step": 0.7272031307220459,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8526413440704346,
"step": 69
},
{
"adv/mean_abs_final_conf": 0.7055125832557678,
"adv/mean_abs_reasoning": 0.48625022172927856,
"adv/mean_abs_step_conf": 0.7672015428543091,
"adv/ratio_final_to_reasoning": 1.4509249594719245,
"adv/ratio_step_to_reasoning": 1.5777916565793384,
"adv/std_final_conf": 0.903593122959137,
"adv/std_reasoning": 0.7575156688690186,
"adv/std_step_conf": 0.9353858828544617,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.8324080267558528,
"calib/avg_num_step_conf": 7.1328125,
"calib/ece": 0.19293877551020416,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.4816326530612245,
"calib/gap": 0.42121070234113706,
"calib/mean_conf": 0.6623265306122449,
"calib/mu_c": 0.8858260869565217,
"calib/mu_w": 0.4646153846153846,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.19293877551020416,
"calib/std_conf": 0.36361099351126486,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5061911764705882,
"calib/step_q_c_n": 680.0,
"calib/step_q_gap": 0.1676920490709372,
"calib/step_q_w": 0.338499127399651,
"calib/step_q_w_n": 1146.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2419.0,
"completions/max_terminated_length": 2419.0,
"completions/mean_length": 524.58203125,
"completions/mean_terminated_length": 530.8023681640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 128.0,
"epoch": 0.07466666666666667,
"grad_norm": 0.05342242121696472,
"kl": 0.12483978271484375,
"learning_rate": 3.6111111111111115e-06,
"loss": -0.0851,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.03515394777059555,
"mask/share_reasoning": 0.8060823678970337,
"mask/share_step_conf": 0.14704498648643494,
"num_tokens": 16411652.0,
"reward": 1.1459590196609497,
"reward_std": 0.2323385775089264,
"rewards/accuracy_reward_step": 0.44921875,
"rewards/final_brier_reward_step": 0.7573128938674927,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.8355700373649597,
"step": 70
},
{
"adv/mean_abs_final_conf": 0.7016242742538452,
"adv/mean_abs_reasoning": 0.5792092084884644,
"adv/mean_abs_step_conf": 0.7449058294296265,
"adv/ratio_final_to_reasoning": 1.2113486180318884,
"adv/ratio_step_to_reasoning": 1.286073871949607,
"adv/std_final_conf": 0.8915162086486816,
"adv/std_reasoning": 0.8265349864959717,
"adv/std_step_conf": 0.9350942373275757,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6868696727178085,
"calib/avg_num_step_conf": 6.8125,
"calib/ece": 0.22936000000000004,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.448,
"calib/gap": 0.21375886524822707,
"calib/mean_conf": 0.66056,
"calib/mu_c": 0.753758865248227,
"calib/mu_w": 0.5399999999999999,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.16296000000000005,
"calib/std_conf": 0.3538594161527993,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4909367681498829,
"calib/step_q_c_n": 854.0,
"calib/step_q_gap": 0.11885811646448968,
"calib/step_q_w": 0.3720786516853932,
"calib/step_q_w_n": 890.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2433.0,
"completions/max_terminated_length": 2433.0,
"completions/mean_length": 521.33203125,
"completions/mean_terminated_length": 521.33203125,
"completions/min_length": 174.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.07573333333333333,
"grad_norm": 0.046845778822898865,
"kl": 0.15727996826171875,
"learning_rate": 3.5833333333333335e-06,
"loss": -0.007,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.034949902445077896,
"mask/share_reasoning": 0.8204107284545898,
"mask/share_step_conf": 0.14463931322097778,
"num_tokens": 16649521.0,
"reward": 1.1325509548187256,
"reward_std": 0.23369169235229492,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.707699179649353,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8346226215362549,
"step": 71
},
{
"adv/mean_abs_final_conf": 0.6370866894721985,
"adv/mean_abs_reasoning": 0.3830593228340149,
"adv/mean_abs_step_conf": 0.7415189146995544,
"adv/ratio_final_to_reasoning": 1.6631541160747503,
"adv/ratio_step_to_reasoning": 1.9357808843119195,
"adv/std_final_conf": 0.8679909706115723,
"adv/std_reasoning": 0.6815306544303894,
"adv/std_step_conf": 0.9352692365646362,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.8205185659411013,
"calib/avg_num_step_conf": 6.53125,
"calib/ece": 0.21690476190476196,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5952380952380952,
"calib/gap": 0.3527323943661972,
"calib/mean_conf": 0.7747619047619047,
"calib/mu_c": 0.9287323943661971,
"calib/mu_w": 0.576,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.21408730158730163,
"calib/std_conf": 0.3167887340812743,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.542945914844649,
"calib/step_q_c_n": 869.0,
"calib/step_q_gap": 0.1503058152182481,
"calib/step_q_w": 0.39264009962640095,
"calib/step_q_w_n": 803.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2398.0,
"completions/max_terminated_length": 2398.0,
"completions/mean_length": 479.93359375,
"completions/mean_terminated_length": 479.93359375,
"completions/min_length": 145.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.0768,
"grad_norm": 0.05887433886528015,
"kl": 0.102203369140625,
"learning_rate": 3.555555555555556e-06,
"loss": 0.0013,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03538592532277107,
"mask/share_reasoning": 0.8194608688354492,
"mask/share_step_conf": 0.14515320956707,
"num_tokens": 16876792.0,
"reward": 1.1849489212036133,
"reward_std": 0.1923227161169052,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.7667582035064697,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8640721440315247,
"step": 72
},
{
"adv/mean_abs_final_conf": 0.6404528617858887,
"adv/mean_abs_reasoning": 0.5191984176635742,
"adv/mean_abs_step_conf": 0.7633548378944397,
"adv/ratio_final_to_reasoning": 1.2335416287822432,
"adv/ratio_step_to_reasoning": 1.47025647984365,
"adv/std_final_conf": 0.8272820711135864,
"adv/std_reasoning": 0.7753027677536011,
"adv/std_step_conf": 0.9352089166641235,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7346065406234031,
"calib/avg_num_step_conf": 6.1875,
"calib/ece": 0.21533333333333327,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.6431372549019608,
"calib/gap": 0.2979605263157894,
"calib/mean_conf": 0.777607843137255,
"calib/mu_c": 0.8979605263157895,
"calib/mu_w": 0.6000000000000001,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.19843137254901952,
"calib/std_conf": 0.33210109956253087,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.54545067264574,
"calib/step_q_c_n": 892.0,
"calib/step_q_gap": 0.12984807148967065,
"calib/step_q_w": 0.41560260115606934,
"calib/step_q_w_n": 692.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1710.0,
"completions/max_terminated_length": 1710.0,
"completions/mean_length": 468.4609375,
"completions/mean_terminated_length": 470.2980651855469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.07786666666666667,
"grad_norm": 0.027524832636117935,
"kl": 0.09661865234375,
"learning_rate": 3.5277777777777784e-06,
"loss": -0.0586,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03430914878845215,
"mask/share_reasoning": 0.8275830745697021,
"mask/share_step_conf": 0.1342015266418457,
"num_tokens": 17103750.0,
"reward": 1.1775658130645752,
"reward_std": 0.2001640647649765,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7525933384895325,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8569004535675049,
"step": 73
},
{
"adv/mean_abs_final_conf": 0.6963641047477722,
"adv/mean_abs_reasoning": 0.4993099272251129,
"adv/mean_abs_step_conf": 0.763095498085022,
"adv/ratio_final_to_reasoning": 1.394653033673448,
"adv/ratio_step_to_reasoning": 1.5283002729905304,
"adv/std_final_conf": 0.8839811682701111,
"adv/std_reasoning": 0.7575883269309998,
"adv/std_step_conf": 0.9351733326911926,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.733104674796748,
"calib/avg_num_step_conf": 6.42578125,
"calib/ece": 0.2261354581673307,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.47410358565737054,
"calib/gap": 0.27355119410569095,
"calib/mean_conf": 0.6674103585657369,
"calib/mu_c": 0.806910569105691,
"calib/mu_w": 0.5333593750000001,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.20175298804780878,
"calib/std_conf": 0.361738920694594,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4805170068027211,
"calib/step_q_c_n": 735.0,
"calib/step_q_gap": 0.09661590790162217,
"calib/step_q_w": 0.3839010989010989,
"calib/step_q_w_n": 910.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2426.0,
"completions/max_terminated_length": 2426.0,
"completions/mean_length": 495.07421875,
"completions/mean_terminated_length": 498.9724426269531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.07893333333333333,
"grad_norm": 0.029221879318356514,
"kl": 0.10986328125,
"learning_rate": 3.5e-06,
"loss": -0.0388,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.035621289163827896,
"mask/share_reasoning": 0.8160022497177124,
"mask/share_step_conf": 0.1405639499425888,
"num_tokens": 17334417.0,
"reward": 1.1277399063110352,
"reward_std": 0.23573684692382812,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.7065363526344299,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8383581638336182,
"step": 74
},
{
"adv/mean_abs_final_conf": 0.5262219309806824,
"adv/mean_abs_reasoning": 0.47117334604263306,
"adv/mean_abs_step_conf": 0.7681576013565063,
"adv/ratio_final_to_reasoning": 1.1168329775026544,
"adv/ratio_step_to_reasoning": 1.630307842767917,
"adv/std_final_conf": 0.778659999370575,
"adv/std_reasoning": 0.7391869425773621,
"adv/std_step_conf": 0.9349520206451416,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.8094276094276096,
"calib/avg_num_step_conf": 5.77734375,
"calib/ece": 0.1750980392156863,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.7137254901960784,
"calib/gap": 0.4265555555555557,
"calib/mean_conf": 0.8001176470588235,
"calib/mu_c": 0.9506666666666668,
"calib/mu_w": 0.5241111111111111,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16407843137254904,
"calib/std_conf": 0.34238778880660364,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5480240174672488,
"calib/step_q_c_n": 916.0,
"calib/step_q_gap": 0.12948050059335897,
"calib/step_q_w": 0.41854351687388985,
"calib/step_q_w_n": 563.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1407.0,
"completions/max_terminated_length": 1407.0,
"completions/mean_length": 413.14453125,
"completions/mean_terminated_length": 414.7647399902344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.08,
"grad_norm": 0.035005368292331696,
"kl": 0.12042236328125,
"learning_rate": 3.4722222222222224e-06,
"loss": -0.0406,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.038583215326070786,
"mask/share_reasoning": 0.8095848560333252,
"mask/share_step_conf": 0.14792568981647491,
"num_tokens": 17544934.0,
"reward": 1.2190998792648315,
"reward_std": 0.19501495361328125,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.8225722312927246,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8583348989486694,
"step": 75
},
{
"adv/mean_abs_final_conf": 0.5986725091934204,
"adv/mean_abs_reasoning": 0.49344608187675476,
"adv/mean_abs_step_conf": 0.7562553882598877,
"adv/ratio_final_to_reasoning": 1.2132480754866901,
"adv/ratio_step_to_reasoning": 1.5325998443103928,
"adv/std_final_conf": 0.8273491263389587,
"adv/std_reasoning": 0.7574414610862732,
"adv/std_step_conf": 0.9353153109550476,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7254597254597255,
"calib/avg_num_step_conf": 5.83203125,
"calib/ece": 0.19760784313725496,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.611764705882353,
"calib/gap": 0.3269153069153068,
"calib/mean_conf": 0.7394901960784314,
"calib/mu_c": 0.8664102564102564,
"calib/mu_w": 0.5394949494949496,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16266666666666674,
"calib/std_conf": 0.36759117169992844,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4986483253588517,
"calib/step_q_c_n": 836.0,
"calib/step_q_gap": 0.13443219141669038,
"calib/step_q_w": 0.3642161339421613,
"calib/step_q_w_n": 657.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2337.0,
"completions/max_terminated_length": 2337.0,
"completions/mean_length": 445.44921875,
"completions/mean_terminated_length": 445.44921875,
"completions/min_length": 163.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.08106666666666666,
"grad_norm": 0.040073275566101074,
"kl": 0.1200408935546875,
"learning_rate": 3.444444444444445e-06,
"loss": 0.1011,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.038839176297187805,
"mask/share_reasoning": 0.8191479444503784,
"mask/share_step_conf": 0.14201289415359497,
"num_tokens": 17762025.0,
"reward": 1.187859296798706,
"reward_std": 0.17804092168807983,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7633512020111084,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.860849142074585,
"step": 76
},
{
"adv/mean_abs_final_conf": 0.694234311580658,
"adv/mean_abs_reasoning": 0.4896814823150635,
"adv/mean_abs_step_conf": 0.7751322388648987,
"adv/ratio_final_to_reasoning": 1.4177262907687087,
"adv/ratio_step_to_reasoning": 1.5829314908954935,
"adv/std_final_conf": 0.8711609244346619,
"adv/std_reasoning": 0.7207184433937073,
"adv/std_step_conf": 0.9352930784225464,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6509993337774818,
"calib/avg_num_step_conf": 6.76171875,
"calib/ece": 0.2668770750988142,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5177865612648221,
"calib/gap": 0.2290602398401066,
"calib/mean_conf": 0.638103162055336,
"calib/mu_c": 0.7241139240506329,
"calib/mu_w": 0.4950536842105263,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14023715415019758,
"calib/std_conf": 0.40786421303490994,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.43599173553719006,
"calib/step_q_c_n": 968.0,
"calib/step_q_gap": 0.07213852452801572,
"calib/step_q_w": 0.36385321100917434,
"calib/step_q_w_n": 763.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1579.0,
"completions/max_terminated_length": 1579.0,
"completions/mean_length": 443.3359375,
"completions/mean_terminated_length": 448.5928955078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 100.0,
"epoch": 0.08213333333333334,
"grad_norm": 0.035977620631456375,
"kl": 0.1226959228515625,
"learning_rate": 3.416666666666667e-06,
"loss": -0.1244,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.04012027755379677,
"mask/share_reasoning": 0.7930710315704346,
"mask/share_step_conf": 0.15508995950222015,
"num_tokens": 17980183.0,
"reward": 1.145344614982605,
"reward_std": 0.19340042769908905,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.6981140375137329,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8476542234420776,
"step": 77
},
{
"adv/mean_abs_final_conf": 0.6861317753791809,
"adv/mean_abs_reasoning": 0.42679959535598755,
"adv/mean_abs_step_conf": 0.7618663311004639,
"adv/ratio_final_to_reasoning": 1.607620491783475,
"adv/ratio_step_to_reasoning": 1.785068072674722,
"adv/std_final_conf": 0.9054518342018127,
"adv/std_reasoning": 0.7013146281242371,
"adv/std_step_conf": 0.9350811243057251,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7512456638284454,
"calib/avg_num_step_conf": 6.015625,
"calib/ece": 0.24960937499999997,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.48828125,
"calib/gap": 0.33470829391359197,
"calib/mean_conf": 0.6021875000000001,
"calib/mu_c": 0.7394701986754967,
"calib/mu_w": 0.4047619047619047,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.13097656249999995,
"calib/std_conf": 0.4208203251908705,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4692647058823529,
"calib/step_q_c_n": 884.0,
"calib/step_q_gap": 0.1322372668579626,
"calib/step_q_w": 0.3370274390243903,
"calib/step_q_w_n": 656.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1291.0,
"completions/max_terminated_length": 1291.0,
"completions/mean_length": 477.24609375,
"completions/mean_terminated_length": 479.11767578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.0832,
"grad_norm": 0.04717475175857544,
"kl": 0.1195831298828125,
"learning_rate": 3.3888888888888893e-06,
"loss": -0.0247,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03434506803750992,
"mask/share_reasoning": 0.8292993307113647,
"mask/share_step_conf": 0.13244935870170593,
"num_tokens": 18210382.0,
"reward": 1.1887092590332031,
"reward_std": 0.18209871649742126,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7424741983413696,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8785045742988586,
"step": 78
},
{
"adv/mean_abs_final_conf": 0.6638272404670715,
"adv/mean_abs_reasoning": 0.46231958270072937,
"adv/mean_abs_step_conf": 0.7449385523796082,
"adv/ratio_final_to_reasoning": 1.4358622591524162,
"adv/ratio_step_to_reasoning": 1.6113065079958442,
"adv/std_final_conf": 0.8744451999664307,
"adv/std_reasoning": 0.7206392884254456,
"adv/std_step_conf": 0.9351168274879456,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6891389432485323,
"calib/avg_num_step_conf": 6.79296875,
"calib/ece": 0.24629482071713144,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.4820717131474104,
"calib/gap": 0.3075264187866928,
"calib/mean_conf": 0.6057370517928288,
"calib/mu_c": 0.7343835616438356,
"calib/mu_w": 0.42685714285714277,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.13517928286852587,
"calib/std_conf": 0.4147179651512537,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4479030439684329,
"calib/step_q_c_n": 887.0,
"calib/step_q_gap": 0.14023872471960652,
"calib/step_q_w": 0.30766431924882637,
"calib/step_q_w_n": 852.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2616.0,
"completions/max_terminated_length": 2616.0,
"completions/mean_length": 480.0703125,
"completions/mean_terminated_length": 487.69049072265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.08426666666666667,
"grad_norm": 0.12207309156656265,
"kl": 0.6846923828125,
"learning_rate": 3.3611111111111117e-06,
"loss": -0.0241,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03398709371685982,
"mask/share_reasoning": 0.8186401128768921,
"mask/share_step_conf": 0.1317477524280548,
"num_tokens": 18439656.0,
"reward": 1.1509122848510742,
"reward_std": 0.21398352086544037,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7164999842643738,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8506331443786621,
"step": 79
},
{
"adv/mean_abs_final_conf": 0.5320920944213867,
"adv/mean_abs_reasoning": 0.3927716910839081,
"adv/mean_abs_step_conf": 0.750799298286438,
"adv/ratio_final_to_reasoning": 1.3547109083982214,
"adv/ratio_step_to_reasoning": 1.9115412727798762,
"adv/std_final_conf": 0.7759950757026672,
"adv/std_reasoning": 0.6815386414527893,
"adv/std_step_conf": 0.9349751472473145,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6903703703703703,
"calib/avg_num_step_conf": 6.390625,
"calib/ece": 0.2574117647058824,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.7098039215686275,
"calib/gap": 0.2938585858585858,
"calib/mean_conf": 0.7692549019607843,
"calib/mu_c": 0.872969696969697,
"calib/mu_w": 0.5791111111111111,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18980392156862746,
"calib/std_conf": 0.3844199012995899,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.45761523046092184,
"calib/step_q_c_n": 998.0,
"calib/step_q_gap": 0.12296005804712873,
"calib/step_q_w": 0.3346551724137931,
"calib/step_q_w_n": 638.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2091.0,
"completions/max_terminated_length": 2091.0,
"completions/mean_length": 427.796875,
"completions/mean_terminated_length": 429.47454833984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.08533333333333333,
"grad_norm": 0.22899942100048065,
"kl": 1.1295013427734375,
"learning_rate": 3.3333333333333333e-06,
"loss": -0.033,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03959141671657562,
"mask/share_reasoning": 0.807418704032898,
"mask/share_step_conf": 0.14908364415168762,
"num_tokens": 18651332.0,
"reward": 1.1782162189483643,
"reward_std": 0.17925795912742615,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.740231990814209,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8587168455123901,
"step": 80
},
{
"adv/mean_abs_final_conf": 0.6153532266616821,
"adv/mean_abs_reasoning": 0.5657199025154114,
"adv/mean_abs_step_conf": 0.7426446676254272,
"adv/ratio_final_to_reasoning": 1.087734802904373,
"adv/ratio_step_to_reasoning": 1.3127426917867646,
"adv/std_final_conf": 0.8280157446861267,
"adv/std_reasoning": 0.8098756670951843,
"adv/std_step_conf": 0.9354367852210999,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7782327301545371,
"calib/avg_num_step_conf": 6.99609375,
"calib/ece": 0.21093117408906886,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.582995951417004,
"calib/gap": 0.46848236692642986,
"calib/mean_conf": 0.6295546558704453,
"calib/mu_c": 0.8438805970149255,
"calib/mu_w": 0.3753982300884956,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1489878542510122,
"calib/std_conf": 0.45019766694245333,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.47373198847262254,
"calib/step_q_c_n": 694.0,
"calib/step_q_gap": 0.19115222548264987,
"calib/step_q_w": 0.28257976298997267,
"calib/step_q_w_n": 1097.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 3006.0,
"completions/max_terminated_length": 3006.0,
"completions/mean_length": 523.76171875,
"completions/mean_terminated_length": 529.9723510742188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.0864,
"grad_norm": 0.04257092624902725,
"kl": 0.1041107177734375,
"learning_rate": 3.3055555555555558e-06,
"loss": 0.0727,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03545115515589714,
"mask/share_reasoning": 0.8166124820709229,
"mask/share_step_conf": 0.1362176388502121,
"num_tokens": 18891663.0,
"reward": 1.1495893001556396,
"reward_std": 0.2690851092338562,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.7468858957290649,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8364243507385254,
"step": 81
},
{
"adv/mean_abs_final_conf": 0.6698199510574341,
"adv/mean_abs_reasoning": 0.5826090574264526,
"adv/mean_abs_step_conf": 0.7445812821388245,
"adv/ratio_final_to_reasoning": 1.1496902468633363,
"adv/ratio_step_to_reasoning": 1.2780118548582964,
"adv/std_final_conf": 0.8605046272277832,
"adv/std_reasoning": 0.8098371028900146,
"adv/std_step_conf": 0.9354823231697083,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.683967408738051,
"calib/avg_num_step_conf": 6.26171875,
"calib/ece": 0.31599206349206355,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6388888888888888,
"calib/gap": 0.2821293385513568,
"calib/mean_conf": 0.7020238095238095,
"calib/mu_c": 0.824055944055944,
"calib/mu_w": 0.5419266055045872,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.22527777777777783,
"calib/std_conf": 0.4171723971656276,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.46937583001328026,
"calib/step_q_c_n": 753.0,
"calib/step_q_gap": 0.16399935942504495,
"calib/step_q_w": 0.3053764705882353,
"calib/step_q_w_n": 850.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2562.0,
"completions/max_terminated_length": 2562.0,
"completions/mean_length": 445.4140625,
"completions/mean_terminated_length": 445.4140625,
"completions/min_length": 153.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.08746666666666666,
"grad_norm": 0.05169864743947983,
"kl": 0.134735107421875,
"learning_rate": 3.277777777777778e-06,
"loss": 0.0822,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.039256948977708817,
"mask/share_reasoning": 0.8154367208480835,
"mask/share_step_conf": 0.14530633389949799,
"num_tokens": 19111241.0,
"reward": 1.1285752058029175,
"reward_std": 0.26243820786476135,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.6821433305740356,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8453171253204346,
"step": 82
},
{
"adv/mean_abs_final_conf": 0.6228658556938171,
"adv/mean_abs_reasoning": 0.4538366198539734,
"adv/mean_abs_step_conf": 0.7593032121658325,
"adv/ratio_final_to_reasoning": 1.3724451233006068,
"adv/ratio_step_to_reasoning": 1.6730761224383923,
"adv/std_final_conf": 0.8233391642570496,
"adv/std_reasoning": 0.7206999063491821,
"adv/std_step_conf": 0.9348890781402588,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6860727471625871,
"calib/avg_num_step_conf": 6.4765625,
"calib/ece": 0.273705306122449,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.5877551020408164,
"calib/gap": 0.3269244906331191,
"calib/mean_conf": 0.6560089795918368,
"calib/mu_c": 0.7934507042253521,
"calib/mu_w": 0.466526213592233,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.17506122448979594,
"calib/std_conf": 0.43924072903342637,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.44755494505494503,
"calib/step_q_c_n": 728.0,
"calib/step_q_gap": 0.17971623537752562,
"calib/step_q_w": 0.2678387096774194,
"calib/step_q_w_n": 930.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2860.0,
"completions/max_terminated_length": 2860.0,
"completions/mean_length": 510.4765625,
"completions/mean_terminated_length": 516.5296630859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.08853333333333334,
"grad_norm": 0.04278864711523056,
"kl": 0.11474609375,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.0214,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.03520066291093826,
"mask/share_reasoning": 0.8231779932975769,
"mask/share_step_conf": 0.12990263104438782,
"num_tokens": 19349187.0,
"reward": 1.1222634315490723,
"reward_std": 0.2234598845243454,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.6821732521057129,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.8405274748802185,
"step": 83
},
{
"adv/mean_abs_final_conf": 0.6835577487945557,
"adv/mean_abs_reasoning": 0.48697832226753235,
"adv/mean_abs_step_conf": 0.7442065477371216,
"adv/ratio_final_to_reasoning": 1.4036718218003716,
"adv/ratio_step_to_reasoning": 1.528212886914246,
"adv/std_final_conf": 0.8857553005218506,
"adv/std_reasoning": 0.7393399477005005,
"adv/std_step_conf": 0.9358551502227783,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.8175134526342136,
"calib/avg_num_step_conf": 5.57421875,
"calib/ece": 0.17885375494071146,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5454545454545454,
"calib/gap": 0.551800775872857,
"calib/mean_conf": 0.6150592885375494,
"calib/mu_c": 0.881145038167939,
"calib/mu_w": 0.32934426229508196,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.13806324110671936,
"calib/std_conf": 0.4436211221776772,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4896778916544657,
"calib/step_q_c_n": 683.0,
"calib/step_q_gap": 0.1805784292888743,
"calib/step_q_w": 0.3090994623655914,
"calib/step_q_w_n": 744.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2236.0,
"completions/max_terminated_length": 2236.0,
"completions/mean_length": 442.5078125,
"completions/mean_terminated_length": 442.5078125,
"completions/min_length": 124.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.0896,
"grad_norm": 0.028044484555721283,
"kl": 0.1327667236328125,
"learning_rate": 3.2222222222222227e-06,
"loss": 0.0029,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03884014114737511,
"mask/share_reasoning": 0.826440691947937,
"mask/share_step_conf": 0.13471916317939758,
"num_tokens": 19568389.0,
"reward": 1.1857885122299194,
"reward_std": 0.23890095949172974,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.8046886920928955,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.845633864402771,
"step": 84
},
{
"adv/mean_abs_final_conf": 0.6975332498550415,
"adv/mean_abs_reasoning": 0.5464292168617249,
"adv/mean_abs_step_conf": 0.7447526454925537,
"adv/ratio_final_to_reasoning": 1.276529929825392,
"adv/ratio_step_to_reasoning": 1.3629444080055753,
"adv/std_final_conf": 0.8820512294769287,
"adv/std_reasoning": 0.7927603721618652,
"adv/std_step_conf": 0.9354966282844543,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.8047684189955051,
"calib/avg_num_step_conf": 6.09375,
"calib/ece": 0.1956620967741936,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.5362903225806451,
"calib/gap": 0.5061403296202204,
"calib/mean_conf": 0.5976443548387097,
"calib/mu_c": 0.8405100775193799,
"calib/mu_w": 0.3343697478991595,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.13657258064516137,
"calib/std_conf": 0.45424748775246127,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.43061111111111117,
"calib/step_q_c_n": 720.0,
"calib/step_q_gap": 0.13943253968253977,
"calib/step_q_w": 0.2911785714285714,
"calib/step_q_w_n": 840.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2364.0,
"completions/max_terminated_length": 2364.0,
"completions/mean_length": 489.54296875,
"completions/mean_terminated_length": 493.39764404296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.09066666666666667,
"grad_norm": 0.02340533211827278,
"kl": 0.11705780029296875,
"learning_rate": 3.1944444444444443e-06,
"loss": 0.0119,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03687724471092224,
"mask/share_reasoning": 0.8172705173492432,
"mask/share_step_conf": 0.1380397528409958,
"num_tokens": 19801536.0,
"reward": 1.1727514266967773,
"reward_std": 0.27141836285591125,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.7622454762458801,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8596714735031128,
"step": 85
},
{
"adv/mean_abs_final_conf": 0.718398928642273,
"adv/mean_abs_reasoning": 0.5384534001350403,
"adv/mean_abs_step_conf": 0.7146424055099487,
"adv/ratio_final_to_reasoning": 1.3341896038953485,
"adv/ratio_step_to_reasoning": 1.3272130983493122,
"adv/std_final_conf": 0.8906943798065186,
"adv/std_reasoning": 0.7927916049957275,
"adv/std_step_conf": 0.9359204173088074,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7341952165481578,
"calib/avg_num_step_conf": 5.578125,
"calib/ece": 0.25289156626506026,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.4738955823293173,
"calib/gap": 0.42104072398190046,
"calib/mean_conf": 0.5242971887550201,
"calib/mu_c": 0.7441176470588236,
"calib/mu_w": 0.3230769230769231,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.14963855421686748,
"calib/std_conf": 0.46650155717157316,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.44880690737833595,
"calib/step_q_c_n": 637.0,
"calib/step_q_gap": 0.15325697058946108,
"calib/step_q_w": 0.29554993678887487,
"calib/step_q_w_n": 791.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2740.0,
"completions/max_terminated_length": 2740.0,
"completions/mean_length": 464.55078125,
"completions/mean_terminated_length": 466.3725891113281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 121.0,
"epoch": 0.09173333333333333,
"grad_norm": 0.03036743961274624,
"kl": 0.1243438720703125,
"learning_rate": 3.1666666666666667e-06,
"loss": -0.1255,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.039164334535598755,
"mask/share_reasoning": 0.8249720931053162,
"mask/share_step_conf": 0.13195732235908508,
"num_tokens": 20025973.0,
"reward": 1.1242517232894897,
"reward_std": 0.2771415114402771,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/final_brier_reward_step": 0.7137933969497681,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8330358266830444,
"step": 86
},
{
"adv/mean_abs_final_conf": 0.6766160726547241,
"adv/mean_abs_reasoning": 0.4702734351158142,
"adv/mean_abs_step_conf": 0.7363328337669373,
"adv/ratio_final_to_reasoning": 1.438771621212442,
"adv/ratio_step_to_reasoning": 1.5657546839438221,
"adv/std_final_conf": 0.8864827156066895,
"adv/std_reasoning": 0.7575428485870361,
"adv/std_step_conf": 0.9356470704078674,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6348829201101929,
"calib/avg_num_step_conf": 5.29296875,
"calib/ece": 0.2603557312252964,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6640316205533597,
"calib/gap": 0.26056060606060605,
"calib/mean_conf": 0.7392490118577075,
"calib/mu_c": 0.8298787878787879,
"calib/mu_w": 0.5693181818181818,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1737154150197628,
"calib/std_conf": 0.3979252383367354,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.45692941176470586,
"calib/step_q_c_n": 850.0,
"calib/step_q_gap": 0.13245238206173554,
"calib/step_q_w": 0.3244770297029703,
"calib/step_q_w_n": 505.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2788.0,
"completions/max_terminated_length": 2788.0,
"completions/mean_length": 395.359375,
"completions/mean_terminated_length": 395.359375,
"completions/min_length": 93.0,
"completions/min_terminated_length": 93.0,
"epoch": 0.0928,
"grad_norm": 0.029617100954055786,
"kl": 0.1478118896484375,
"learning_rate": 3.138888888888889e-06,
"loss": 0.0086,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.04414728283882141,
"mask/share_reasoning": 0.8172008395195007,
"mask/share_step_conf": 0.13865187764167786,
"num_tokens": 20232681.0,
"reward": 1.141819715499878,
"reward_std": 0.2538405954837799,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7091293334960938,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.833006739616394,
"step": 87
},
{
"adv/mean_abs_final_conf": 0.6007475852966309,
"adv/mean_abs_reasoning": 0.44460412859916687,
"adv/mean_abs_step_conf": 0.7527111172676086,
"adv/ratio_final_to_reasoning": 1.3511965963731192,
"adv/ratio_step_to_reasoning": 1.6929917399534897,
"adv/std_final_conf": 0.8415180444717407,
"adv/std_reasoning": 0.7013773322105408,
"adv/std_step_conf": 0.9355311393737793,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.834070796460177,
"calib/avg_num_step_conf": 5.0859375,
"calib/ece": 0.1724274509803922,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5686274509803921,
"calib/gap": 0.596774149320703,
"calib/mean_conf": 0.6180823529411765,
"calib/mu_c": 0.8825352112676057,
"calib/mu_w": 0.2857610619469027,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.11682352941176474,
"calib/std_conf": 0.4629612412125359,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.48145161290322575,
"calib/step_q_c_n": 682.0,
"calib/step_q_gap": 0.16227419354838707,
"calib/step_q_w": 0.3191774193548387,
"calib/step_q_w_n": 620.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1165.0,
"completions/max_terminated_length": 1165.0,
"completions/mean_length": 431.01171875,
"completions/mean_terminated_length": 432.7019958496094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.09386666666666667,
"grad_norm": 0.02849281206727028,
"kl": 0.132293701171875,
"learning_rate": 3.1111111111111116e-06,
"loss": -0.0292,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.037658434361219406,
"mask/share_reasoning": 0.8327912092208862,
"mask/share_step_conf": 0.12564414739608765,
"num_tokens": 20452868.0,
"reward": 1.18056058883667,
"reward_std": 0.21894444525241852,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.8136299848556519,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8269731402397156,
"step": 88
},
{
"adv/mean_abs_final_conf": 0.6541560292243958,
"adv/mean_abs_reasoning": 0.537696123123169,
"adv/mean_abs_step_conf": 0.7444639801979065,
"adv/ratio_final_to_reasoning": 1.2165905631321607,
"adv/ratio_step_to_reasoning": 1.3845440727259506,
"adv/std_final_conf": 0.8602299094200134,
"adv/std_reasoning": 0.7754936814308167,
"adv/std_step_conf": 0.9357773065567017,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7566519546027743,
"calib/avg_num_step_conf": 5.01171875,
"calib/ece": 0.23392857142857146,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.48412698412698413,
"calib/gap": 0.44930517023959654,
"calib/mean_conf": 0.5343253968253967,
"calib/mu_c": 0.7518461538461539,
"calib/mu_w": 0.3025409836065574,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.1261904761904762,
"calib/std_conf": 0.46855902593331045,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.4928026533996683,
"calib/step_q_c_n": 603.0,
"calib/step_q_gap": 0.18202324163496242,
"calib/step_q_w": 0.31077941176470586,
"calib/step_q_w_n": 680.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1882.0,
"completions/max_terminated_length": 1882.0,
"completions/mean_length": 453.1171875,
"completions/mean_terminated_length": 453.1171875,
"completions/min_length": 74.0,
"completions/min_terminated_length": 74.0,
"epoch": 0.09493333333333333,
"grad_norm": 0.02972985990345478,
"kl": 0.13824462890625,
"learning_rate": 3.0833333333333336e-06,
"loss": -0.1189,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.04198278859257698,
"mask/share_reasoning": 0.8361014127731323,
"mask/share_step_conf": 0.12191580981016159,
"num_tokens": 20677754.0,
"reward": 1.1438519954681396,
"reward_std": 0.2497408092021942,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.7326839566230774,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8392842411994934,
"step": 89
},
{
"adv/mean_abs_final_conf": 0.6622197031974792,
"adv/mean_abs_reasoning": 0.5669442415237427,
"adv/mean_abs_step_conf": 0.744004487991333,
"adv/ratio_final_to_reasoning": 1.1680508499701316,
"adv/ratio_step_to_reasoning": 1.3123062789944138,
"adv/std_final_conf": 0.8622347116470337,
"adv/std_reasoning": 0.792926013469696,
"adv/std_step_conf": 0.9356372356414795,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7202957674655788,
"calib/avg_num_step_conf": 5.46484375,
"calib/ece": 0.27196456692913396,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5748031496062992,
"calib/gap": 0.3847572667006629,
"calib/mean_conf": 0.6178779527559055,
"calib/mu_c": 0.7784459459459458,
"calib/mu_w": 0.39368867924528295,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.1535826771653544,
"calib/std_conf": 0.4612813257464722,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.443278364116095,
"calib/step_q_c_n": 758.0,
"calib/step_q_gap": 0.15647649204121195,
"calib/step_q_w": 0.28680187207488306,
"calib/step_q_w_n": 641.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2763.0,
"completions/max_terminated_length": 2763.0,
"completions/mean_length": 446.40625,
"completions/mean_terminated_length": 446.40625,
"completions/min_length": 134.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.096,
"grad_norm": 0.032319456338882446,
"kl": 0.1483612060546875,
"learning_rate": 3.055555555555556e-06,
"loss": -0.0047,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03996175155043602,
"mask/share_reasoning": 0.8266382813453674,
"mask/share_step_conf": 0.13339999318122864,
"num_tokens": 20895354.0,
"reward": 1.12278413772583,
"reward_std": 0.25631049275398254,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.7038639783859253,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8215529322624207,
"step": 90
},
{
"adv/mean_abs_final_conf": 0.6649396419525146,
"adv/mean_abs_reasoning": 0.6094751358032227,
"adv/mean_abs_step_conf": 0.7547518014907837,
"adv/ratio_final_to_reasoning": 1.0910037225328244,
"adv/ratio_step_to_reasoning": 1.2383635642430302,
"adv/std_final_conf": 0.8605794906616211,
"adv/std_reasoning": 0.8588709831237793,
"adv/std_step_conf": 0.9356327652931213,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7864431486880467,
"calib/avg_num_step_conf": 5.578125,
"calib/ece": 0.2057248979591837,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.5959183673469388,
"calib/gap": 0.4907748299319727,
"calib/mean_conf": 0.6532138775510203,
"calib/mu_c": 0.8495238095238095,
"calib/mu_w": 0.35874897959183677,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.12946938775510203,
"calib/std_conf": 0.4481611051105887,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.45719131614654,
"calib/step_q_c_n": 737.0,
"calib/step_q_gap": 0.1617209832955993,
"calib/step_q_w": 0.2954703328509407,
"calib/step_q_w_n": 691.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2508.0,
"completions/max_terminated_length": 2508.0,
"completions/mean_length": 447.51953125,
"completions/mean_terminated_length": 447.51953125,
"completions/min_length": 151.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.09706666666666666,
"grad_norm": 0.026164310052990913,
"kl": 0.142333984375,
"learning_rate": 3.0277777777777776e-06,
"loss": 0.0197,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.03725254535675049,
"mask/share_reasoning": 0.8388819098472595,
"mask/share_step_conf": 0.12386555224657059,
"num_tokens": 21117631.0,
"reward": 1.150740146636963,
"reward_std": 0.2667975425720215,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.7564589977264404,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8258475065231323,
"step": 91
},
{
"adv/mean_abs_final_conf": 0.6045271158218384,
"adv/mean_abs_reasoning": 0.41747045516967773,
"adv/mean_abs_step_conf": 0.7512445449829102,
"adv/ratio_final_to_reasoning": 1.4480716140166923,
"adv/ratio_step_to_reasoning": 1.7995154763169348,
"adv/std_final_conf": 0.8119957447052002,
"adv/std_reasoning": 0.7015039324760437,
"adv/std_step_conf": 0.9357256889343262,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.8115202702702702,
"calib/avg_num_step_conf": 5.11328125,
"calib/ece": 0.20149193548387095,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.5282258064516129,
"calib/gap": 0.5300135135135136,
"calib/mean_conf": 0.5797983870967741,
"calib/mu_c": 0.7935135135135135,
"calib/mu_w": 0.26349999999999996,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.09225806451612903,
"calib/std_conf": 0.4660389447910064,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.48583832335329347,
"calib/step_q_c_n": 668.0,
"calib/step_q_gap": 0.18569791773706884,
"calib/step_q_w": 0.3001404056162246,
"calib/step_q_w_n": 641.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1893.0,
"completions/max_terminated_length": 1893.0,
"completions/mean_length": 395.828125,
"completions/mean_terminated_length": 398.94488525390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 139.0,
"epoch": 0.09813333333333334,
"grad_norm": 0.040191903710365295,
"kl": 0.14263916015625,
"learning_rate": 3e-06,
"loss": -0.0157,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.042106062173843384,
"mask/share_reasoning": 0.8261221051216125,
"mask/share_step_conf": 0.12395933270454407,
"num_tokens": 21325683.0,
"reward": 1.1780098676681519,
"reward_std": 0.24134156107902527,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7681527137756348,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8523279428482056,
"step": 92
},
{
"adv/mean_abs_final_conf": 0.7300728559494019,
"adv/mean_abs_reasoning": 0.5577922463417053,
"adv/mean_abs_step_conf": 0.7381477355957031,
"adv/ratio_final_to_reasoning": 1.3088616070546037,
"adv/ratio_step_to_reasoning": 1.323338107399061,
"adv/std_final_conf": 0.8927043080329895,
"adv/std_reasoning": 0.8100207448005676,
"adv/std_step_conf": 0.9362002611160278,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7275034293552811,
"calib/avg_num_step_conf": 5.76171875,
"calib/ece": 0.25652949245541834,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.43621399176954734,
"calib/gap": 0.40540123456790117,
"calib/mean_conf": 0.4959807956104253,
"calib/mu_c": 0.7212037037037036,
"calib/mu_w": 0.3158024691358024,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.15403292181069955,
"calib/std_conf": 0.46841187055040207,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.5340118577075099,
"calib/step_q_c_n": 506.0,
"calib/step_q_gap": 0.23139059867758216,
"calib/step_q_w": 0.30262125902992776,
"calib/step_q_w_n": 969.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2944.0,
"completions/max_terminated_length": 2944.0,
"completions/mean_length": 464.828125,
"completions/mean_terminated_length": 466.6510009765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 93.0,
"epoch": 0.0992,
"grad_norm": 0.037815142422914505,
"kl": 0.1412811279296875,
"learning_rate": 2.9722222222222225e-06,
"loss": -0.1825,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.03980404883623123,
"mask/share_reasoning": 0.8261933326721191,
"mask/share_step_conf": 0.13009636104106903,
"num_tokens": 21550455.0,
"reward": 1.0781924724578857,
"reward_std": 0.3342801332473755,
"rewards/accuracy_reward_step": 0.421875,
"rewards/final_brier_reward_step": 0.6839195489883423,
"rewards/format_reward_step": 0.9375,
"rewards/step_l2_reward": 0.8003935217857361,
"step": 93
},
{
"adv/mean_abs_final_conf": 0.6486687660217285,
"adv/mean_abs_reasoning": 0.5233867168426514,
"adv/mean_abs_step_conf": 0.7764137387275696,
"adv/ratio_final_to_reasoning": 1.2393680335161072,
"adv/ratio_step_to_reasoning": 1.4834418103143934,
"adv/std_final_conf": 0.8439018130302429,
"adv/std_reasoning": 0.7754757404327393,
"adv/std_step_conf": 0.9355916976928711,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7897025171624714,
"calib/avg_num_step_conf": 5.125,
"calib/ece": 0.2174999999999999,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.45161290322580644,
"calib/gap": 0.47784897025171624,
"calib/mean_conf": 0.5110483870967741,
"calib/mu_c": 0.7326315789473684,
"calib/mu_w": 0.25478260869565217,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.09612903225806443,
"calib/std_conf": 0.4654729099708739,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5159848484848484,
"calib/step_q_c_n": 660.0,
"calib/step_q_gap": 0.20066276259527782,
"calib/step_q_w": 0.31532208588957056,
"calib/step_q_w_n": 652.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2713.0,
"completions/max_terminated_length": 2713.0,
"completions/mean_length": 407.4921875,
"completions/mean_terminated_length": 410.7007751464844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.10026666666666667,
"grad_norm": 0.03030179999768734,
"kl": 0.14703369140625,
"learning_rate": 2.944444444444445e-06,
"loss": -0.0434,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.04173561558127403,
"mask/share_reasoning": 0.8220940232276917,
"mask/share_step_conf": 0.12835785746574402,
"num_tokens": 21763453.0,
"reward": 1.1417219638824463,
"reward_std": 0.24963004887104034,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.7397593855857849,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8312063217163086,
"step": 94
},
{
"adv/mean_abs_final_conf": 0.5931450128555298,
"adv/mean_abs_reasoning": 0.4570343494415283,
"adv/mean_abs_step_conf": 0.7530524730682373,
"adv/ratio_final_to_reasoning": 1.297812765233776,
"adv/ratio_step_to_reasoning": 1.6476933823211044,
"adv/std_final_conf": 0.81406569480896,
"adv/std_reasoning": 0.7394242286682129,
"adv/std_step_conf": 0.9350123405456543,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.8168081494057726,
"calib/avg_num_step_conf": 5.34375,
"calib/ece": 0.18647999999999998,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.596,
"calib/gap": 0.5381324278438031,
"calib/mean_conf": 0.6447999999999999,
"calib/mu_c": 0.8492903225806453,
"calib/mu_w": 0.31115789473684213,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.10563999999999997,
"calib/std_conf": 0.45376222848535996,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.4892594339622641,
"calib/step_q_c_n": 848.0,
"calib/step_q_gap": 0.1607017416545718,
"calib/step_q_w": 0.32855769230769233,
"calib/step_q_w_n": 520.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1066.0,
"completions/max_terminated_length": 1066.0,
"completions/mean_length": 417.03515625,
"completions/mean_terminated_length": 418.6706237792969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 61.0,
"epoch": 0.10133333333333333,
"grad_norm": 0.023666884750127792,
"kl": 0.13555908203125,
"learning_rate": 2.916666666666667e-06,
"loss": -0.0643,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.04241481423377991,
"mask/share_reasoning": 0.8130682706832886,
"mask/share_step_conf": 0.14061065018177032,
"num_tokens": 21976342.0,
"reward": 1.1643306016921997,
"reward_std": 0.25867539644241333,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.7615358829498291,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": 0.837979257106781,
"step": 95
},
{
"adv/mean_abs_final_conf": 0.5102696418762207,
"adv/mean_abs_reasoning": 0.3827498257160187,
"adv/mean_abs_step_conf": 0.7660720944404602,
"adv/ratio_final_to_reasoning": 1.3331675355348571,
"adv/ratio_step_to_reasoning": 2.0014956061896356,
"adv/std_final_conf": 0.7896233797073364,
"adv/std_reasoning": 0.6612542271614075,
"adv/std_step_conf": 0.934788167476654,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.8952425125894514,
"calib/avg_num_step_conf": 4.734375,
"calib/ece": 0.11535714285714285,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.5912698412698413,
"calib/gap": 0.7047031539888683,
"calib/mean_conf": 0.6286111111111111,
"calib/mu_c": 0.9026623376623376,
"calib/mu_w": 0.19795918367346935,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.06642857142857142,
"calib/std_conf": 0.46566309560314306,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5700742942050521,
"calib/step_q_c_n": 673.0,
"calib/step_q_gap": 0.2269945168395604,
"calib/step_q_w": 0.34307977736549167,
"calib/step_q_w_n": 539.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1718.0,
"completions/max_terminated_length": 1718.0,
"completions/mean_length": 396.13671875,
"completions/mean_terminated_length": 396.13671875,
"completions/min_length": 110.0,
"completions/min_terminated_length": 110.0,
"epoch": 0.1024,
"grad_norm": 0.03511851280927658,
"kl": 0.15106201171875,
"learning_rate": 2.888888888888889e-06,
"loss": 0.0488,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.044577062129974365,
"mask/share_reasoning": 0.8310078382492065,
"mask/share_step_conf": 0.12441502511501312,
"num_tokens": 22183569.0,
"reward": 1.2230329513549805,
"reward_std": 0.18603476881980896,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.8587906360626221,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8477666974067688,
"step": 96
},
{
"adv/mean_abs_final_conf": 0.775114893913269,
"adv/mean_abs_reasoning": 0.6093622446060181,
"adv/mean_abs_step_conf": 0.7242501378059387,
"adv/ratio_final_to_reasoning": 1.2720100412758883,
"adv/ratio_step_to_reasoning": 1.188537925046868,
"adv/std_final_conf": 0.9237008690834045,
"adv/std_reasoning": 0.8266138434410095,
"adv/std_step_conf": 0.9355844855308533,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6741730279898219,
"calib/avg_num_step_conf": 5.48828125,
"calib/ece": 0.30406374501992034,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.46613545816733065,
"calib/gap": 0.3173810432569975,
"calib/mean_conf": 0.5304382470119522,
"calib/mu_c": 0.6960833333333334,
"calib/mu_w": 0.3787022900763359,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.17820717131474106,
"calib/std_conf": 0.4694815814810152,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5330985915492958,
"calib/step_q_c_n": 568.0,
"calib/step_q_gap": 0.21296716980497082,
"calib/step_q_w": 0.320131421744325,
"calib/step_q_w_n": 837.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1150.0,
"completions/max_terminated_length": 1150.0,
"completions/mean_length": 383.9453125,
"completions/mean_terminated_length": 385.45098876953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 93.0,
"epoch": 0.10346666666666667,
"grad_norm": 0.03347829729318619,
"kl": 0.1522064208984375,
"learning_rate": 2.861111111111111e-06,
"loss": -0.0896,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.04272129386663437,
"mask/share_reasoning": 0.812430739402771,
"mask/share_step_conf": 0.14094170928001404,
"num_tokens": 22386931.0,
"reward": 1.104257583618164,
"reward_std": 0.2778571546077728,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.6669449806213379,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8355258703231812,
"step": 97
},
{
"adv/mean_abs_final_conf": 0.6549431085586548,
"adv/mean_abs_reasoning": 0.5944483876228333,
"adv/mean_abs_step_conf": 0.7494335174560547,
"adv/ratio_final_to_reasoning": 1.1017661452119276,
"adv/ratio_step_to_reasoning": 1.2607209188555435,
"adv/std_final_conf": 0.8280261754989624,
"adv/std_reasoning": 0.8100293874740601,
"adv/std_step_conf": 0.9358698725700378,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.7347417840375587,
"calib/avg_num_step_conf": 4.546875,
"calib/ece": 0.26029045643153526,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.5311203319502075,
"calib/gap": 0.4239486413430076,
"calib/mean_conf": 0.5760580912863071,
"calib/mu_c": 0.7502112676056338,
"calib/mu_w": 0.32626262626262625,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.12356846473029046,
"calib/std_conf": 0.47653641097218263,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5361228070175439,
"calib/step_q_c_n": 570.0,
"calib/step_q_gap": 0.20119014708488397,
"calib/step_q_w": 0.3349326599326599,
"calib/step_q_w_n": 594.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2790.0,
"completions/max_terminated_length": 2790.0,
"completions/mean_length": 448.4140625,
"completions/mean_terminated_length": 448.4140625,
"completions/min_length": 73.0,
"completions/min_terminated_length": 73.0,
"epoch": 0.10453333333333334,
"grad_norm": 0.03285660967230797,
"kl": 0.13463592529296875,
"learning_rate": 2.8333333333333335e-06,
"loss": -0.067,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.04362473264336586,
"mask/share_reasoning": 0.8375781178474426,
"mask/share_step_conf": 0.1187971755862236,
"num_tokens": 22607909.0,
"reward": 1.088001012802124,
"reward_std": 0.30376923084259033,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.6849917769432068,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l2_reward": 0.7955691814422607,
"step": 98
},
{
"adv/mean_abs_final_conf": 0.6276251077651978,
"adv/mean_abs_reasoning": 0.4622398614883423,
"adv/mean_abs_step_conf": 0.7264397144317627,
"adv/ratio_final_to_reasoning": 1.3577909653752924,
"adv/ratio_step_to_reasoning": 1.5715644083414548,
"adv/std_final_conf": 0.8565012216567993,
"adv/std_reasoning": 0.7206854820251465,
"adv/std_step_conf": 0.9356908798217773,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.775219298245614,
"calib/avg_num_step_conf": 5.3515625,
"calib/ece": 0.20413978494623658,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.32661290322580644,
"calib/gap": 0.5181816520467837,
"calib/mean_conf": 0.3597311827956989,
"calib/mu_c": 0.677326388888889,
"calib/mu_w": 0.15914473684210526,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.08838709677419354,
"calib/std_conf": 0.459931153639939,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.4832096069868996,
"calib/step_q_c_n": 458.0,
"calib/step_q_gap": 0.17074250172374167,
"calib/step_q_w": 0.3124671052631579,
"calib/step_q_w_n": 912.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2182.0,
"completions/max_terminated_length": 2182.0,
"completions/mean_length": 492.59765625,
"completions/mean_terminated_length": 494.5294494628906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 54.0,
"epoch": 0.1056,
"grad_norm": 0.030274106189608574,
"kl": 0.1233978271484375,
"learning_rate": 2.805555555555556e-06,
"loss": -0.1299,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.038488492369651794,
"mask/share_reasoning": 0.8359262943267822,
"mask/share_step_conf": 0.12167896330356598,
"num_tokens": 22839814.0,
"reward": 1.1263723373413086,
"reward_std": 0.2581334710121155,
"rewards/accuracy_reward_step": 0.375,
"rewards/final_brier_reward_step": 0.762831449508667,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8161922693252563,
"step": 99
},
{
"adv/mean_abs_final_conf": 0.5714178085327148,
"adv/mean_abs_reasoning": 0.4815468192100525,
"adv/mean_abs_step_conf": 0.7494217157363892,
"adv/ratio_final_to_reasoning": 1.186629805737457,
"adv/ratio_step_to_reasoning": 1.556280066319966,
"adv/std_final_conf": 0.8202148675918579,
"adv/std_reasoning": 0.7575972080230713,
"adv/std_step_conf": 0.9345163702964783,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.8089188488462536,
"calib/avg_num_step_conf": 5.44921875,
"calib/ece": 0.1883534136546185,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.46987951807228917,
"calib/gap": 0.5778376976925071,
"calib/mean_conf": 0.5196787148594376,
"calib/mu_c": 0.7888721804511278,
"calib/mu_w": 0.21103448275862066,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.08694779116465866,
"calib/std_conf": 0.48183540615364495,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5101029411764706,
"calib/step_q_c_n": 680.0,
"calib/step_q_gap": 0.21871832579185518,
"calib/step_q_w": 0.2913846153846154,
"calib/step_q_w_n": 715.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2196.0,
"completions/max_terminated_length": 2196.0,
"completions/mean_length": 464.1171875,
"completions/mean_terminated_length": 464.1171875,
"completions/min_length": 64.0,
"completions/min_terminated_length": 64.0,
"epoch": 0.10666666666666667,
"grad_norm": 0.03927968069911003,
"kl": 0.1226654052734375,
"learning_rate": 2.7777777777777783e-06,
"loss": 0.0249,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.040494032204151154,
"mask/share_reasoning": 0.8294671773910522,
"mask/share_step_conf": 0.13003885746002197,
"num_tokens": 23066036.0,
"reward": 1.1902015209197998,
"reward_std": 0.20735013484954834,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.7843140363693237,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8651010394096375,
"step": 100
},
{
"adv/mean_abs_final_conf": 0.6751835346221924,
"adv/mean_abs_reasoning": 0.5666059255599976,
"adv/mean_abs_step_conf": 0.7753067016601562,
"adv/ratio_final_to_reasoning": 1.1916280860544894,
"adv/ratio_step_to_reasoning": 1.3683349691302504,
"adv/std_final_conf": 0.8751931190490723,
"adv/std_reasoning": 0.8098925948143005,
"adv/std_step_conf": 0.9354108572006226,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7181372549019608,
"calib/avg_num_step_conf": 5.296875,
"calib/ece": 0.2703238866396761,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.31983805668016196,
"calib/gap": 0.3784651563328034,
"calib/mean_conf": 0.3620647773279352,
"calib/mu_c": 0.5704504504504505,
"calib/mu_w": 0.19198529411764706,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.09149797570850204,
"calib/std_conf": 0.4572796120287642,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.4386486486486486,
"calib/step_q_c_n": 629.0,
"calib/step_q_gap": 0.10985910256886872,
"calib/step_q_w": 0.3287895460797799,
"calib/step_q_w_n": 727.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2491.0,
"completions/max_terminated_length": 2491.0,
"completions/mean_length": 475.5703125,
"completions/mean_terminated_length": 475.5703125,
"completions/min_length": 120.0,
"completions/min_terminated_length": 120.0,
"epoch": 0.10773333333333333,
"grad_norm": 0.028355449438095093,
"kl": 0.127655029296875,
"learning_rate": 2.7500000000000004e-06,
"loss": -0.0626,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.036653559654951096,
"mask/share_reasoning": 0.8383820652961731,
"mask/share_step_conf": 0.12496437877416611,
"num_tokens": 23294774.0,
"reward": 1.1122589111328125,
"reward_std": 0.24153487384319305,
"rewards/accuracy_reward_step": 0.43359375,
"rewards/final_brier_reward_step": 0.6937956809997559,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8345438241958618,
"step": 101
},
{
"adv/mean_abs_final_conf": 0.6679022312164307,
"adv/mean_abs_reasoning": 0.41688475012779236,
"adv/mean_abs_step_conf": 0.747225284576416,
"adv/ratio_final_to_reasoning": 1.602126801260279,
"adv/ratio_step_to_reasoning": 1.792402538944782,
"adv/std_final_conf": 0.843447744846344,
"adv/std_reasoning": 0.6817389130592346,
"adv/std_step_conf": 0.9350944757461548,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7101910828025477,
"calib/avg_num_step_conf": 5.30078125,
"calib/ece": 0.3211952191235059,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.41434262948207173,
"calib/gap": 0.35007318064778425,
"calib/mean_conf": 0.47450199203187254,
"calib/mu_c": 0.6056050955414013,
"calib/mu_w": 0.25553191489361704,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.08509960159362544,
"calib/std_conf": 0.4715871850502317,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4950946142649199,
"calib/step_q_c_n": 687.0,
"calib/step_q_gap": 0.19806476351865127,
"calib/step_q_w": 0.29702985074626864,
"calib/step_q_w_n": 670.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2019.0,
"completions/max_terminated_length": 2019.0,
"completions/mean_length": 388.14453125,
"completions/mean_terminated_length": 388.14453125,
"completions/min_length": 67.0,
"completions/min_terminated_length": 67.0,
"epoch": 0.1088,
"grad_norm": 0.046899620443582535,
"kl": 0.139801025390625,
"learning_rate": 2.7222222222222224e-06,
"loss": -0.0034,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.04749134182929993,
"mask/share_reasoning": 0.8126708269119263,
"mask/share_step_conf": 0.139837846159935,
"num_tokens": 23500835.0,
"reward": 1.130561113357544,
"reward_std": 0.241227924823761,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.6594758033752441,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8568267822265625,
"step": 102
},
{
"adv/mean_abs_final_conf": 0.6553860902786255,
"adv/mean_abs_reasoning": 0.46481823921203613,
"adv/mean_abs_step_conf": 0.7428863048553467,
"adv/ratio_final_to_reasoning": 1.4099835914133696,
"adv/ratio_step_to_reasoning": 1.5982296781526772,
"adv/std_final_conf": 0.8601471185684204,
"adv/std_reasoning": 0.7576319575309753,
"adv/std_step_conf": 0.9357208609580994,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7938649080735413,
"calib/avg_num_step_conf": 5.4140625,
"calib/ece": 0.20838056680161943,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.4979757085020243,
"calib/gap": 0.5344164668265388,
"calib/mean_conf": 0.5435222672064778,
"calib/mu_c": 0.7771942446043165,
"calib/mu_w": 0.24277777777777776,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.09457489878542506,
"calib/std_conf": 0.4767536799716597,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5236781609195402,
"calib/step_q_c_n": 696.0,
"calib/step_q_gap": 0.16034482758620683,
"calib/step_q_w": 0.3633333333333334,
"calib/step_q_w_n": 690.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2578.0,
"completions/max_terminated_length": 2578.0,
"completions/mean_length": 472.52734375,
"completions/mean_terminated_length": 474.38043212890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 93.0,
"epoch": 0.10986666666666667,
"grad_norm": 0.03498329222202301,
"kl": 0.1224365234375,
"learning_rate": 2.6944444444444444e-06,
"loss": -0.0804,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03836042433977127,
"mask/share_reasoning": 0.8347409963607788,
"mask/share_step_conf": 0.12299229949712753,
"num_tokens": 23726354.0,
"reward": 1.1513614654541016,
"reward_std": 0.28517740964889526,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.7498066425323486,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8357982635498047,
"step": 103
},
{
"adv/mean_abs_final_conf": 0.6393879055976868,
"adv/mean_abs_reasoning": 0.43127331137657166,
"adv/mean_abs_step_conf": 0.7517832517623901,
"adv/ratio_final_to_reasoning": 1.4825584814345196,
"adv/ratio_step_to_reasoning": 1.7431712835714084,
"adv/std_final_conf": 0.8423436880111694,
"adv/std_reasoning": 0.7014029622077942,
"adv/std_step_conf": 0.9348614811897278,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7802152579930357,
"calib/avg_num_step_conf": 5.5703125,
"calib/ece": 0.2068650793650794,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.3611111111111111,
"calib/gap": 0.5105584045584045,
"calib/mean_conf": 0.40511904761904766,
"calib/mu_c": 0.6786324786324786,
"calib/mu_w": 0.16807407407407407,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.07384920634920637,
"calib/std_conf": 0.46226708604824707,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5198888888888888,
"calib/step_q_c_n": 630.0,
"calib/step_q_gap": 0.18010496929089892,
"calib/step_q_w": 0.3397839195979899,
"calib/step_q_w_n": 796.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2500.0,
"completions/max_terminated_length": 2500.0,
"completions/mean_length": 463.06640625,
"completions/mean_terminated_length": 463.06640625,
"completions/min_length": 150.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.11093333333333333,
"grad_norm": 0.04268581047654152,
"kl": 0.1243896484375,
"learning_rate": 2.666666666666667e-06,
"loss": 0.0509,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03702666983008385,
"mask/share_reasoning": 0.8287439346313477,
"mask/share_step_conf": 0.13422942161560059,
"num_tokens": 23951579.0,
"reward": 1.1680681705474854,
"reward_std": 0.1921078860759735,
"rewards/accuracy_reward_step": 0.45703125,
"rewards/final_brier_reward_step": 0.7757472991943359,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8485928773880005,
"step": 104
},
{
"adv/mean_abs_final_conf": 0.715624213218689,
"adv/mean_abs_reasoning": 0.6200796365737915,
"adv/mean_abs_step_conf": 0.7586425542831421,
"adv/ratio_final_to_reasoning": 1.1540843643452356,
"adv/ratio_step_to_reasoning": 1.2234598744041503,
"adv/std_final_conf": 0.8816226124763489,
"adv/std_reasoning": 0.8266752362251282,
"adv/std_step_conf": 0.9358975887298584,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7314206932773109,
"calib/avg_num_step_conf": 5.65234375,
"calib/ece": 0.2634412955465587,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.4939271255060729,
"calib/gap": 0.42086462710084027,
"calib/mean_conf": 0.5375303643724696,
"calib/mu_c": 0.7556302521008403,
"calib/mu_w": 0.33476562500000007,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.1595951417004049,
"calib/std_conf": 0.47572826558997283,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4693718166383702,
"calib/step_q_c_n": 589.0,
"calib/step_q_gap": 0.1266095788761324,
"calib/step_q_w": 0.3427622377622378,
"calib/step_q_w_n": 858.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3055.0,
"completions/max_terminated_length": 3055.0,
"completions/mean_length": 472.3515625,
"completions/mean_terminated_length": 472.3515625,
"completions/min_length": 93.0,
"completions/min_terminated_length": 93.0,
"epoch": 0.112,
"grad_norm": 0.03280618041753769,
"kl": 0.11415863037109375,
"learning_rate": 2.6388888888888893e-06,
"loss": 0.0266,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03885013237595558,
"mask/share_reasoning": 0.8337293863296509,
"mask/share_step_conf": 0.12742048501968384,
"num_tokens": 24178261.0,
"reward": 1.1076858043670654,
"reward_std": 0.2959200143814087,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/final_brier_reward_step": 0.6975457072257996,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.8223005533218384,
"step": 105
},
{
"adv/mean_abs_final_conf": 0.6601937413215637,
"adv/mean_abs_reasoning": 0.5205706357955933,
"adv/mean_abs_step_conf": 0.7354485988616943,
"adv/ratio_final_to_reasoning": 1.2682116430032269,
"adv/ratio_step_to_reasoning": 1.4127738836780546,
"adv/std_final_conf": 0.8471842408180237,
"adv/std_reasoning": 0.7753406167030334,
"adv/std_step_conf": 0.9351677298545837,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7849028106120304,
"calib/avg_num_step_conf": 5.69140625,
"calib/ece": 0.25281124497991964,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5140562248995983,
"calib/gap": 0.41113278171788814,
"calib/mean_conf": 0.5916064257028113,
"calib/mu_c": 0.7699290780141844,
"calib/mu_w": 0.3587962962962963,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.13907630522088352,
"calib/std_conf": 0.4523759561933446,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5066219839142091,
"calib/step_q_c_n": 746.0,
"calib/step_q_gap": 0.15805658307032727,
"calib/step_q_w": 0.3485654008438818,
"calib/step_q_w_n": 711.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2512.0,
"completions/max_terminated_length": 2512.0,
"completions/mean_length": 449.7421875,
"completions/mean_terminated_length": 451.50592041015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.11306666666666666,
"grad_norm": 0.03570527955889702,
"kl": 0.11209869384765625,
"learning_rate": 2.6111111111111113e-06,
"loss": -0.0621,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03827952593564987,
"mask/share_reasoning": 0.8275967836380005,
"mask/share_step_conf": 0.13021747767925262,
"num_tokens": 24397979.0,
"reward": 1.1611725091934204,
"reward_std": 0.2587474584579468,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7297816276550293,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8591046929359436,
"step": 106
},
{
"adv/mean_abs_final_conf": 0.6493844985961914,
"adv/mean_abs_reasoning": 0.5660048723220825,
"adv/mean_abs_step_conf": 0.7470730543136597,
"adv/ratio_final_to_reasoning": 1.1473125592223914,
"adv/ratio_step_to_reasoning": 1.3199056948904515,
"adv/std_final_conf": 0.8542997241020203,
"adv/std_reasoning": 0.7928820848464966,
"adv/std_step_conf": 0.9353109002113342,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.708204334365325,
"calib/avg_num_step_conf": 5.421875,
"calib/ece": 0.26161417322834657,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6496062992125984,
"calib/gap": 0.3564912280701755,
"calib/mean_conf": 0.705,
"calib/mu_c": 0.8481578947368422,
"calib/mu_w": 0.4916666666666667,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.18409448818897645,
"calib/std_conf": 0.42935288241468145,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5074205378973105,
"calib/step_q_c_n": 818.0,
"calib/step_q_gap": 0.14568369579204732,
"calib/step_q_w": 0.36173684210526313,
"calib/step_q_w_n": 570.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2293.0,
"completions/max_terminated_length": 2293.0,
"completions/mean_length": 446.109375,
"completions/mean_terminated_length": 446.109375,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.11413333333333334,
"grad_norm": 0.031605981290340424,
"kl": 0.1169891357421875,
"learning_rate": 2.5833333333333337e-06,
"loss": 0.0015,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03787776082754135,
"mask/share_reasoning": 0.8313525319099426,
"mask/share_step_conf": 0.13076971471309662,
"num_tokens": 24616799.0,
"reward": 1.1570827960968018,
"reward_std": 0.24284838140010834,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7178605794906616,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8543074131011963,
"step": 107
},
{
"adv/mean_abs_final_conf": 0.48324739933013916,
"adv/mean_abs_reasoning": 0.4229738414287567,
"adv/mean_abs_step_conf": 0.7486779689788818,
"adv/ratio_final_to_reasoning": 1.142499492871203,
"adv/ratio_step_to_reasoning": 1.7700337364834058,
"adv/std_final_conf": 0.7400717735290527,
"adv/std_reasoning": 0.7013469338417053,
"adv/std_step_conf": 0.9353437423706055,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7492725156409137,
"calib/avg_num_step_conf": 5.71875,
"calib/ece": 0.20225296442687749,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.7707509881422925,
"calib/gap": 0.4063814927979049,
"calib/mean_conf": 0.797588932806324,
"calib/mu_c": 0.9244827586206897,
"calib/mu_w": 0.5181012658227848,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.15604743083003955,
"calib/std_conf": 0.38190095175662503,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.533143153526971,
"calib/step_q_c_n": 964.0,
"calib/step_q_gap": 0.13612315352697096,
"calib/step_q_w": 0.39702,
"calib/step_q_w_n": 500.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1693.0,
"completions/max_terminated_length": 1693.0,
"completions/mean_length": 460.95703125,
"completions/mean_terminated_length": 462.7647399902344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.1152,
"grad_norm": 0.03979187458753586,
"kl": 0.1657867431640625,
"learning_rate": 2.5555555555555557e-06,
"loss": 0.0304,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03806034475564957,
"mask/share_reasoning": 0.8202017545700073,
"mask/share_step_conf": 0.1378316879272461,
"num_tokens": 24838036.0,
"reward": 1.1871888637542725,
"reward_std": 0.21590068936347961,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.7780038714408875,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8434158563613892,
"step": 108
},
{
"adv/mean_abs_final_conf": 0.5771458148956299,
"adv/mean_abs_reasoning": 0.42229336500167847,
"adv/mean_abs_step_conf": 0.717828631401062,
"adv/ratio_final_to_reasoning": 1.3666940158847534,
"adv/ratio_step_to_reasoning": 1.6998340274614756,
"adv/std_final_conf": 0.8250502943992615,
"adv/std_reasoning": 0.7206637263298035,
"adv/std_step_conf": 0.9353057146072388,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.8377148634984832,
"calib/avg_num_step_conf": 6.359375,
"calib/ece": 0.17438524590163945,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.5163934426229508,
"calib/gap": 0.5859433771486349,
"calib/mean_conf": 0.5756967213114754,
"calib/mu_c": 0.8854782608695652,
"calib/mu_w": 0.29953488372093023,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.13938524590163945,
"calib/std_conf": 0.4660418504626948,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5106009244992296,
"calib/step_q_c_n": 649.0,
"calib/step_q_gap": 0.238884887727013,
"calib/step_q_w": 0.27171603677221656,
"calib/step_q_w_n": 979.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2946.0,
"completions/max_terminated_length": 2946.0,
"completions/mean_length": 511.7109375,
"completions/mean_terminated_length": 513.7176513671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.11626666666666667,
"grad_norm": 0.03111584298312664,
"kl": 0.10516357421875,
"learning_rate": 2.5277777777777778e-06,
"loss": 0.0068,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.03405262529850006,
"mask/share_reasoning": 0.8269233107566833,
"mask/share_step_conf": 0.1351177990436554,
"num_tokens": 25073634.0,
"reward": 1.1639158725738525,
"reward_std": 0.26197850704193115,
"rewards/accuracy_reward_step": 0.44921875,
"rewards/final_brier_reward_step": 0.7765480279922485,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8472098708152771,
"step": 109
},
{
"adv/mean_abs_final_conf": 0.6765207648277283,
"adv/mean_abs_reasoning": 0.579919695854187,
"adv/mean_abs_step_conf": 0.7378029227256775,
"adv/ratio_final_to_reasoning": 1.166576630633753,
"adv/ratio_step_to_reasoning": 1.2722501546338032,
"adv/std_final_conf": 0.8702954053878784,
"adv/std_reasoning": 0.8099896311759949,
"adv/std_step_conf": 0.9350488781929016,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7209169418338837,
"calib/avg_num_step_conf": 5.046875,
"calib/ece": 0.26960159362549807,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.5697211155378487,
"calib/gap": 0.39499047498094986,
"calib/mean_conf": 0.6247410358565737,
"calib/mu_c": 0.8245967741935484,
"calib/mu_w": 0.4296062992125985,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.20015936254980088,
"calib/std_conf": 0.45552237063477774,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5037122557726466,
"calib/step_q_c_n": 563.0,
"calib/step_q_gap": 0.15666150131448464,
"calib/step_q_w": 0.3470507544581619,
"calib/step_q_w_n": 729.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2427.0,
"completions/max_terminated_length": 2427.0,
"completions/mean_length": 438.20703125,
"completions/mean_terminated_length": 441.657470703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 55.0,
"epoch": 0.11733333333333333,
"grad_norm": 0.04666442424058914,
"kl": 0.117340087890625,
"learning_rate": 2.5e-06,
"loss": -0.0124,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.038558922708034515,
"mask/share_reasoning": 0.827201783657074,
"mask/share_step_conf": 0.12642675638198853,
"num_tokens": 25290735.0,
"reward": 1.1315009593963623,
"reward_std": 0.24479171633720398,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.7048894166946411,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8439500331878662,
"step": 110
},
{
"adv/mean_abs_final_conf": 0.5560250878334045,
"adv/mean_abs_reasoning": 0.5363640785217285,
"adv/mean_abs_step_conf": 0.7406983375549316,
"adv/ratio_final_to_reasoning": 1.0366560888377605,
"adv/ratio_step_to_reasoning": 1.380961863807823,
"adv/std_final_conf": 0.7945669889450073,
"adv/std_reasoning": 0.7754170894622803,
"adv/std_step_conf": 0.9349315762519836,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7761217347831522,
"calib/avg_num_step_conf": 5.58984375,
"calib/ece": 0.2229644268774703,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.549407114624506,
"calib/gap": 0.49260529933758285,
"calib/mean_conf": 0.603596837944664,
"calib/mu_c": 0.8508730158730159,
"calib/mu_w": 0.35826771653543305,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.16426877470355727,
"calib/std_conf": 0.4633459121415682,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.46158139534883724,
"calib/step_q_c_n": 645.0,
"calib/step_q_gap": 0.1424465352979466,
"calib/step_q_w": 0.31913486005089065,
"calib/step_q_w_n": 786.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1636.0,
"completions/max_terminated_length": 1636.0,
"completions/mean_length": 475.69140625,
"completions/mean_terminated_length": 477.556884765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 99.0,
"epoch": 0.1184,
"grad_norm": 0.03786475956439972,
"kl": 0.126617431640625,
"learning_rate": 2.4722222222222226e-06,
"loss": -0.0007,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03777540475130081,
"mask/share_reasoning": 0.826372504234314,
"mask/share_step_conf": 0.13194583356380463,
"num_tokens": 25519920.0,
"reward": 1.154246211051941,
"reward_std": 0.21423739194869995,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.7497198581695557,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8433482646942139,
"step": 111
},
{
"adv/mean_abs_final_conf": 0.5513437390327454,
"adv/mean_abs_reasoning": 0.4522683918476105,
"adv/mean_abs_step_conf": 0.7459721565246582,
"adv/ratio_final_to_reasoning": 1.2190631690629352,
"adv/ratio_step_to_reasoning": 1.6494014836570974,
"adv/std_final_conf": 0.7818516492843628,
"adv/std_reasoning": 0.7205687761306763,
"adv/std_step_conf": 0.9342420101165771,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.8228179143510952,
"calib/avg_num_step_conf": 5.85546875,
"calib/ece": 0.19661290322580646,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.4435483870967742,
"calib/gap": 0.5576894409937889,
"calib/mean_conf": 0.48282258064516126,
"calib/mu_c": 0.7414285714285714,
"calib/mu_w": 0.18373913043478257,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07157258064516132,
"calib/std_conf": 0.47570077421304313,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4241379310344828,
"calib/step_q_c_n": 725.0,
"calib/step_q_gap": 0.19787178116368176,
"calib/step_q_w": 0.22626614987080101,
"calib/step_q_w_n": 774.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2548.0,
"completions/max_terminated_length": 2548.0,
"completions/mean_length": 511.33203125,
"completions/mean_terminated_length": 517.395263671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 90.0,
"epoch": 0.11946666666666667,
"grad_norm": 0.026878803968429565,
"kl": 0.09716796875,
"learning_rate": 2.4444444444444447e-06,
"loss": -0.0867,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.032932404428720474,
"mask/share_reasoning": 0.8346401453018188,
"mask/share_step_conf": 0.12070866674184799,
"num_tokens": 25758741.0,
"reward": 1.1662858724594116,
"reward_std": 0.1943838894367218,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.7745569944381714,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8402389287948608,
"step": 112
},
{
"adv/mean_abs_final_conf": 0.7061506509780884,
"adv/mean_abs_reasoning": 0.544262170791626,
"adv/mean_abs_step_conf": 0.7459894418716431,
"adv/ratio_final_to_reasoning": 1.2974457694735546,
"adv/ratio_step_to_reasoning": 1.3706435646383544,
"adv/std_final_conf": 0.8878163695335388,
"adv/std_reasoning": 0.8097857236862183,
"adv/std_step_conf": 0.9347683787345886,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.723826597131682,
"calib/avg_num_step_conf": 6.38671875,
"calib/ece": 0.27116935483870963,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.4032258064516129,
"calib/gap": 0.36347979139504566,
"calib/mean_conf": 0.46366935483870964,
"calib/mu_c": 0.6366153846153847,
"calib/mu_w": 0.27313559322033903,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1053225806451613,
"calib/std_conf": 0.4609927964900458,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3957500000000001,
"calib/step_q_c_n": 720.0,
"calib/step_q_gap": 0.14008879781420774,
"calib/step_q_w": 0.25566120218579236,
"calib/step_q_w_n": 915.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3040.0,
"completions/max_terminated_length": 3040.0,
"completions/mean_length": 490.6796875,
"completions/mean_terminated_length": 490.6796875,
"completions/min_length": 151.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.12053333333333334,
"grad_norm": 0.038772035390138626,
"kl": 0.11591339111328125,
"learning_rate": 2.4166666666666667e-06,
"loss": 0.1866,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03541328012943268,
"mask/share_reasoning": 0.8271290063858032,
"mask/share_step_conf": 0.1374577134847641,
"num_tokens": 25989555.0,
"reward": 1.145193099975586,
"reward_std": 0.2337724268436432,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.6933558583259583,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8683326244354248,
"step": 113
},
{
"adv/mean_abs_final_conf": 0.576580286026001,
"adv/mean_abs_reasoning": 0.44374608993530273,
"adv/mean_abs_step_conf": 0.7638773918151855,
"adv/ratio_final_to_reasoning": 1.2993473049194983,
"adv/ratio_step_to_reasoning": 1.7214290089329176,
"adv/std_final_conf": 0.8202750086784363,
"adv/std_reasoning": 0.7392296195030212,
"adv/std_step_conf": 0.934648871421814,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.847147626651183,
"calib/avg_num_step_conf": 6.0078125,
"calib/ece": 0.15614457831325304,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5542168674698795,
"calib/gap": 0.6127710843373495,
"calib/mean_conf": 0.625863453815261,
"calib/mu_c": 0.8301204819277109,
"calib/mu_w": 0.21734939759036143,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.05767068273092371,
"calib/std_conf": 0.45038421508140897,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.45877307274701407,
"calib/step_q_c_n": 921.0,
"calib/step_q_gap": 0.2278654552429622,
"calib/step_q_w": 0.23090761750405187,
"calib/step_q_w_n": 617.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2338.0,
"completions/max_terminated_length": 2338.0,
"completions/mean_length": 444.98828125,
"completions/mean_terminated_length": 450.26483154296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 181.0,
"epoch": 0.1216,
"grad_norm": 0.06637249886989594,
"kl": 0.3884124755859375,
"learning_rate": 2.388888888888889e-06,
"loss": 0.0056,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.037197984755039215,
"mask/share_reasoning": 0.8127014636993408,
"mask/share_step_conf": 0.13838176429271698,
"num_tokens": 26208496.0,
"reward": 1.2241928577423096,
"reward_std": 0.21004557609558105,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.822487473487854,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8672654628753662,
"step": 114
},
{
"adv/mean_abs_final_conf": 0.6964755058288574,
"adv/mean_abs_reasoning": 0.4984782338142395,
"adv/mean_abs_step_conf": 0.7788224220275879,
"adv/ratio_final_to_reasoning": 1.3972034455739197,
"adv/ratio_step_to_reasoning": 1.562400059212656,
"adv/std_final_conf": 0.8562023043632507,
"adv/std_reasoning": 0.7393795251846313,
"adv/std_step_conf": 0.9353573322296143,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7044401544401544,
"calib/avg_num_step_conf": 5.390625,
"calib/ece": 0.2933864541832669,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5139442231075697,
"calib/gap": 0.3227696267696266,
"calib/mean_conf": 0.6018326693227092,
"calib/mu_c": 0.7445714285714284,
"calib/mu_w": 0.4218018018018018,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.16872509960159363,
"calib/std_conf": 0.45197202682942783,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.40772423025435073,
"calib/step_q_c_n": 747.0,
"calib/step_q_gap": 0.06531822709479307,
"calib/step_q_w": 0.34240600315955766,
"calib/step_q_w_n": 633.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2930.0,
"completions/max_terminated_length": 2930.0,
"completions/mean_length": 441.5,
"completions/mean_terminated_length": 441.5,
"completions/min_length": 114.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.12266666666666666,
"grad_norm": 0.0388445183634758,
"kl": 0.11220550537109375,
"learning_rate": 2.361111111111111e-06,
"loss": -0.0193,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.037708356976509094,
"mask/share_reasoning": 0.8265572786331177,
"mask/share_step_conf": 0.13573437929153442,
"num_tokens": 26426784.0,
"reward": 1.1016981601715088,
"reward_std": 0.24980241060256958,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6808328032493591,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.812958836555481,
"step": 115
},
{
"adv/mean_abs_final_conf": 0.6575169563293457,
"adv/mean_abs_reasoning": 0.46607887744903564,
"adv/mean_abs_step_conf": 0.766182541847229,
"adv/ratio_final_to_reasoning": 1.4107418038940056,
"adv/ratio_step_to_reasoning": 1.6438902917908114,
"adv/std_final_conf": 0.8525789380073547,
"adv/std_reasoning": 0.7393465042114258,
"adv/std_step_conf": 0.9343810081481934,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7432891280671408,
"calib/avg_num_step_conf": 6.70703125,
"calib/ece": 0.23700000000000004,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.496,
"calib/gap": 0.42522198731501054,
"calib/mean_conf": 0.57396,
"calib/mu_c": 0.7797674418604651,
"calib/mu_w": 0.35454545454545455,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.14748000000000003,
"calib/std_conf": 0.45915914278167214,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.4153658536585365,
"calib/step_q_c_n": 738.0,
"calib/step_q_gap": 0.14663245222850585,
"calib/step_q_w": 0.26873340143003066,
"calib/step_q_w_n": 979.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2281.0,
"completions/max_terminated_length": 2281.0,
"completions/mean_length": 540.48046875,
"completions/mean_terminated_length": 542.6000366210938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 66.0,
"epoch": 0.12373333333333333,
"grad_norm": 0.034233979880809784,
"kl": 0.1004180908203125,
"learning_rate": 2.3333333333333336e-06,
"loss": -0.0016,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.032839495688676834,
"mask/share_reasoning": 0.8337111473083496,
"mask/share_step_conf": 0.12954317033290863,
"num_tokens": 26669667.0,
"reward": 1.1264750957489014,
"reward_std": 0.25903597474098206,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.711389422416687,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8334362506866455,
"step": 116
},
{
"adv/mean_abs_final_conf": 0.6496747732162476,
"adv/mean_abs_reasoning": 0.455629825592041,
"adv/mean_abs_step_conf": 0.7362050414085388,
"adv/ratio_final_to_reasoning": 1.425882891604092,
"adv/ratio_step_to_reasoning": 1.615796420815782,
"adv/std_final_conf": 0.8587450385093689,
"adv/std_reasoning": 0.7206072807312012,
"adv/std_step_conf": 0.9353505373001099,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7187174479166667,
"calib/avg_num_step_conf": 5.8984375,
"calib/ece": 0.28959677419354835,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.38306451612903225,
"calib/gap": 0.31717187500000005,
"calib/mean_conf": 0.4939516129032258,
"calib/mu_c": 0.647421875,
"calib/mu_w": 0.33025,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.1337096774193548,
"calib/std_conf": 0.4501609363621312,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.42036231884057973,
"calib/step_q_c_n": 690.0,
"calib/step_q_gap": 0.14292817249911632,
"calib/step_q_w": 0.2774341463414634,
"calib/step_q_w_n": 820.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2075.0,
"completions/max_terminated_length": 2075.0,
"completions/mean_length": 478.65234375,
"completions/mean_terminated_length": 478.65234375,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.1248,
"grad_norm": 0.059748198837041855,
"kl": 0.1706085205078125,
"learning_rate": 2.305555555555556e-06,
"loss": -0.0748,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03415881469845772,
"mask/share_reasoning": 0.8324176073074341,
"mask/share_step_conf": 0.1334235817193985,
"num_tokens": 26898802.0,
"reward": 1.1214919090270996,
"reward_std": 0.23342269659042358,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.67963707447052,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8474394083023071,
"step": 117
},
{
"adv/mean_abs_final_conf": 0.6411893367767334,
"adv/mean_abs_reasoning": 0.46815019845962524,
"adv/mean_abs_step_conf": 0.755583643913269,
"adv/ratio_final_to_reasoning": 1.369623122849176,
"adv/ratio_step_to_reasoning": 1.613976980890745,
"adv/std_final_conf": 0.8469781875610352,
"adv/std_reasoning": 0.7574445009231567,
"adv/std_step_conf": 0.9356558322906494,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7250325520833334,
"calib/avg_num_step_conf": 7.359375,
"calib/ece": 0.2762903225806452,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.5,
"calib/gap": 0.3405885416666667,
"calib/mean_conf": 0.6088709677419355,
"calib/mu_c": 0.773671875,
"calib/mu_w": 0.4330833333333333,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.18451612903225806,
"calib/std_conf": 0.4435027542508566,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.42813802083333335,
"calib/step_q_c_n": 768.0,
"calib/step_q_gap": 0.13966131832437279,
"calib/step_q_w": 0.28847670250896057,
"calib/step_q_w_n": 1116.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2179.0,
"completions/max_terminated_length": 2179.0,
"completions/mean_length": 519.95703125,
"completions/mean_terminated_length": 524.0512084960938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.12586666666666665,
"grad_norm": 0.07028115540742874,
"kl": 0.1076812744140625,
"learning_rate": 2.277777777777778e-06,
"loss": -0.103,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03212461620569229,
"mask/share_reasoning": 0.8212298154830933,
"mask/share_step_conf": 0.13883310556411743,
"num_tokens": 27135919.0,
"reward": 1.117949366569519,
"reward_std": 0.2524290680885315,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.6927351951599121,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8334632515907288,
"step": 118
},
{
"adv/mean_abs_final_conf": 0.6336255073547363,
"adv/mean_abs_reasoning": 0.501274824142456,
"adv/mean_abs_step_conf": 0.7393103837966919,
"adv/ratio_final_to_reasoning": 1.2640281874093637,
"adv/ratio_step_to_reasoning": 1.474860392323611,
"adv/std_final_conf": 0.8469235897064209,
"adv/std_reasoning": 0.7394473552703857,
"adv/std_step_conf": 0.934846818447113,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7766627312081857,
"calib/avg_num_step_conf": 6.29296875,
"calib/ece": 0.25226720647773276,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.46963562753036436,
"calib/gap": 0.4519526433162798,
"calib/mean_conf": 0.5153441295546558,
"calib/mu_c": 0.7367460317460318,
"calib/mu_w": 0.284793388429752,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.12874493927125505,
"calib/std_conf": 0.47622565394745764,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.45553129548762744,
"calib/step_q_c_n": 687.0,
"calib/step_q_gap": 0.20295553791186988,
"calib/step_q_w": 0.25257575757575756,
"calib/step_q_w_n": 924.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2925.0,
"completions/max_terminated_length": 2925.0,
"completions/mean_length": 532.75,
"completions/mean_terminated_length": 534.8392333984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 93.0,
"epoch": 0.12693333333333334,
"grad_norm": 0.03186385706067085,
"kl": 0.100189208984375,
"learning_rate": 2.25e-06,
"loss": 0.0034,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03202106058597565,
"mask/share_reasoning": 0.8441102504730225,
"mask/share_step_conf": 0.1199624240398407,
"num_tokens": 27377367.0,
"reward": 1.1394624710083008,
"reward_std": 0.24343638122081757,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.718923807144165,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.8467713594436646,
"step": 119
},
{
"adv/mean_abs_final_conf": 0.5528663396835327,
"adv/mean_abs_reasoning": 0.4402967095375061,
"adv/mean_abs_step_conf": 0.7554687261581421,
"adv/ratio_final_to_reasoning": 1.2556676616190736,
"adv/ratio_step_to_reasoning": 1.7158173336150913,
"adv/std_final_conf": 0.8008996248245239,
"adv/std_reasoning": 0.7205032706260681,
"adv/std_step_conf": 0.935148298740387,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.8480592105263157,
"calib/avg_num_step_conf": 5.50390625,
"calib/ece": 0.1924603174603174,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.4166666666666667,
"calib/gap": 0.594978947368421,
"calib/mean_conf": 0.4854761904761905,
"calib/mu_c": 0.7215789473684211,
"calib/mu_w": 0.1266,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.03738095238095231,
"calib/std_conf": 0.4695744563623826,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.41179334916864613,
"calib/step_q_c_n": 842.0,
"calib/step_q_gap": 0.14965931036794067,
"calib/step_q_w": 0.26213403880070546,
"calib/step_q_w_n": 567.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1131.0,
"completions/max_terminated_length": 1131.0,
"completions/mean_length": 443.40625,
"completions/mean_terminated_length": 448.6640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.128,
"grad_norm": 0.08257104456424713,
"kl": 0.1157684326171875,
"learning_rate": 2.222222222222222e-06,
"loss": -0.068,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.034355901181697845,
"mask/share_reasoning": 0.8268024921417236,
"mask/share_step_conf": 0.12712284922599792,
"num_tokens": 27597567.0,
"reward": 1.1937901973724365,
"reward_std": 0.19093571603298187,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7867218255996704,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8583849668502808,
"step": 120
},
{
"adv/mean_abs_final_conf": 0.7695930004119873,
"adv/mean_abs_reasoning": 0.6066721677780151,
"adv/mean_abs_step_conf": 0.7388530969619751,
"adv/ratio_final_to_reasoning": 1.268548387889101,
"adv/ratio_step_to_reasoning": 1.217878676828184,
"adv/std_final_conf": 0.9217678904533386,
"adv/std_reasoning": 0.843062162399292,
"adv/std_step_conf": 0.935973048210144,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7125100779360387,
"calib/avg_num_step_conf": 6.828125,
"calib/ece": 0.27237704918032785,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.36885245901639346,
"calib/gap": 0.32524590163934425,
"calib/mean_conf": 0.47975409836065575,
"calib/mu_c": 0.6423770491803279,
"calib/mu_w": 0.31713114754098365,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.12606557377049177,
"calib/std_conf": 0.44842431441871206,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4331975867269985,
"calib/step_q_c_n": 663.0,
"calib/step_q_gap": 0.15232201069013213,
"calib/step_q_w": 0.28087557603686636,
"calib/step_q_w_n": 1085.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2745.0,
"completions/max_terminated_length": 2745.0,
"completions/mean_length": 553.16796875,
"completions/mean_terminated_length": 557.5236206054688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.12906666666666666,
"grad_norm": 0.03182586282491684,
"kl": 0.09899139404296875,
"learning_rate": 2.1944444444444445e-06,
"loss": 0.0229,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.030971001833677292,
"mask/share_reasoning": 0.8363680839538574,
"mask/share_step_conf": 0.12484840303659439,
"num_tokens": 27844234.0,
"reward": 1.0988376140594482,
"reward_std": 0.30374959111213684,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.6719839572906494,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": 0.8275442123413086,
"step": 121
},
{
"adv/mean_abs_final_conf": 0.6513844728469849,
"adv/mean_abs_reasoning": 0.4658776521682739,
"adv/mean_abs_step_conf": 0.7674317955970764,
"adv/ratio_final_to_reasoning": 1.3981878499973772,
"adv/ratio_step_to_reasoning": 1.6472818389663426,
"adv/std_final_conf": 0.8468610048294067,
"adv/std_reasoning": 0.7205932140350342,
"adv/std_step_conf": 0.9350212812423706,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.8093630660120023,
"calib/avg_num_step_conf": 6.140625,
"calib/ece": 0.18643999999999997,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.52,
"calib/gap": 0.5671153846153847,
"calib/mean_conf": 0.56388,
"calib/mu_c": 0.7771153846153847,
"calib/mu_w": 0.21,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.06315999999999997,
"calib/std_conf": 0.471184619443377,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4641991341991342,
"calib/step_q_c_n": 924.0,
"calib/step_q_gap": 0.18450777617444286,
"calib/step_q_w": 0.27969135802469136,
"calib/step_q_w_n": 648.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2630.0,
"completions/max_terminated_length": 2630.0,
"completions/mean_length": 484.57421875,
"completions/mean_terminated_length": 484.57421875,
"completions/min_length": 128.0,
"completions/min_terminated_length": 128.0,
"epoch": 0.13013333333333332,
"grad_norm": 0.05260329321026802,
"kl": 0.110565185546875,
"learning_rate": 2.166666666666667e-06,
"loss": -0.0138,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.033760882914066315,
"mask/share_reasoning": 0.8320001363754272,
"mask/share_step_conf": 0.1342390477657318,
"num_tokens": 28075629.0,
"reward": 1.2003474235534668,
"reward_std": 0.19919967651367188,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7869769334793091,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8643536567687988,
"step": 122
},
{
"adv/mean_abs_final_conf": 0.6745046377182007,
"adv/mean_abs_reasoning": 0.540432870388031,
"adv/mean_abs_step_conf": 0.7357199788093567,
"adv/ratio_final_to_reasoning": 1.248082184997197,
"adv/ratio_step_to_reasoning": 1.361353128430381,
"adv/std_final_conf": 0.8684989809989929,
"adv/std_reasoning": 0.7928730845451355,
"adv/std_step_conf": 0.9353545904159546,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7710259373394965,
"calib/avg_num_step_conf": 7.02734375,
"calib/ece": 0.22268000000000002,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.44,
"calib/gap": 0.41282614278376994,
"calib/mean_conf": 0.55204,
"calib/mu_c": 0.7468939393939394,
"calib/mu_w": 0.3340677966101695,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.12336000000000005,
"calib/std_conf": 0.4447815625675147,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4186330049261084,
"calib/step_q_c_n": 812.0,
"calib/step_q_gap": 0.15670797959682775,
"calib/step_q_w": 0.26192502532928064,
"calib/step_q_w_n": 987.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2799.0,
"completions/max_terminated_length": 2799.0,
"completions/mean_length": 552.5390625,
"completions/mean_terminated_length": 556.8897705078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.1312,
"grad_norm": 0.0321660153567791,
"kl": 0.09477996826171875,
"learning_rate": 2.138888888888889e-06,
"loss": -0.021,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03134801238775253,
"mask/share_reasoning": 0.8331590890884399,
"mask/share_step_conf": 0.12768039107322693,
"num_tokens": 28322367.0,
"reward": 1.1497811079025269,
"reward_std": 0.24997322261333466,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.7325191497802734,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8472995162010193,
"step": 123
},
{
"adv/mean_abs_final_conf": 0.6126042008399963,
"adv/mean_abs_reasoning": 0.30936306715011597,
"adv/mean_abs_step_conf": 0.756053626537323,
"adv/ratio_final_to_reasoning": 1.9802111689781476,
"adv/ratio_step_to_reasoning": 2.4439039653380927,
"adv/std_final_conf": 0.8274469375610352,
"adv/std_reasoning": 0.596075713634491,
"adv/std_step_conf": 0.9345342516899109,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7629561796228463,
"calib/avg_num_step_conf": 6.10546875,
"calib/ece": 0.2468774703557313,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.4505928853754941,
"calib/gap": 0.35856532356532356,
"calib/mean_conf": 0.5804743083003953,
"calib/mu_c": 0.7094444444444444,
"calib/mu_w": 0.3508791208791209,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09351778656126483,
"calib/std_conf": 0.43693119131131747,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.48198266522210187,
"calib/step_q_c_n": 923.0,
"calib/step_q_gap": 0.1471076652221019,
"calib/step_q_w": 0.334875,
"calib/step_q_w_n": 640.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2397.0,
"completions/max_terminated_length": 2397.0,
"completions/mean_length": 488.70703125,
"completions/mean_terminated_length": 488.70703125,
"completions/min_length": 122.0,
"completions/min_terminated_length": 122.0,
"epoch": 0.13226666666666667,
"grad_norm": 0.042988456785678864,
"kl": 0.1159820556640625,
"learning_rate": 2.1111111111111114e-06,
"loss": 0.049,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.032922565937042236,
"mask/share_reasoning": 0.8366243839263916,
"mask/share_step_conf": 0.13045307993888855,
"num_tokens": 28554292.0,
"reward": 1.163818120956421,
"reward_std": 0.19366100430488586,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.7316859364509583,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8478209972381592,
"step": 124
},
{
"adv/mean_abs_final_conf": 0.7198798060417175,
"adv/mean_abs_reasoning": 0.48377013206481934,
"adv/mean_abs_step_conf": 0.7534222602844238,
"adv/ratio_final_to_reasoning": 1.4880617018854367,
"adv/ratio_step_to_reasoning": 1.5573972230750996,
"adv/std_final_conf": 0.8781076073646545,
"adv/std_reasoning": 0.739425003528595,
"adv/std_step_conf": 0.9353218078613281,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7068089430894309,
"calib/avg_num_step_conf": 5.94921875,
"calib/ece": 0.2947011952191235,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.450199203187251,
"calib/gap": 0.28353467987804887,
"calib/mean_conf": 0.5841035856573704,
"calib/mu_c": 0.7230468750000001,
"calib/mu_w": 0.4395121951219512,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.18442231075697216,
"calib/std_conf": 0.4379773926144206,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.47793696275071634,
"calib/step_q_c_n": 698.0,
"calib/step_q_gap": 0.10005817487192847,
"calib/step_q_w": 0.37787878787878787,
"calib/step_q_w_n": 825.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1796.0,
"completions/max_terminated_length": 1796.0,
"completions/mean_length": 513.16015625,
"completions/mean_terminated_length": 515.172607421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.13333333333333333,
"grad_norm": 0.04248211905360222,
"kl": 0.099151611328125,
"learning_rate": 2.0833333333333334e-06,
"loss": -0.0845,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03320443257689476,
"mask/share_reasoning": 0.8370097875595093,
"mask/share_step_conf": 0.12587954103946686,
"num_tokens": 28790469.0,
"reward": 1.1076419353485107,
"reward_std": 0.2369730919599533,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.6809245944023132,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8255103826522827,
"step": 125
},
{
"adv/mean_abs_final_conf": 0.5678446292877197,
"adv/mean_abs_reasoning": 0.4430251121520996,
"adv/mean_abs_step_conf": 0.768993616104126,
"adv/ratio_final_to_reasoning": 1.2817436612775281,
"adv/ratio_step_to_reasoning": 1.735778841900309,
"adv/std_final_conf": 0.7833918333053589,
"adv/std_reasoning": 0.7015147805213928,
"adv/std_step_conf": 0.9341139197349548,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.7835065835065835,
"calib/avg_num_step_conf": 6.734375,
"calib/ece": 0.23622406639004156,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.5352697095435685,
"calib/gap": 0.4505980595980596,
"calib/mean_conf": 0.5942323651452281,
"calib/mu_c": 0.8017692307692308,
"calib/mu_w": 0.3511711711711712,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1455186721991702,
"calib/std_conf": 0.45836296742522104,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5312717536813922,
"calib/step_q_c_n": 747.0,
"calib/step_q_gap": 0.25587769022182205,
"calib/step_q_w": 0.2753940634595702,
"calib/step_q_w_n": 977.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3018.0,
"completions/max_terminated_length": 3018.0,
"completions/mean_length": 561.328125,
"completions/mean_terminated_length": 563.5294189453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.1344,
"grad_norm": 0.038122810423374176,
"kl": 0.09313201904296875,
"learning_rate": 2.0555555555555555e-06,
"loss": 0.0491,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.03354516625404358,
"mask/share_reasoning": 0.8258533477783203,
"mask/share_step_conf": 0.1366952508687973,
"num_tokens": 29039633.0,
"reward": 1.1189892292022705,
"reward_std": 0.22997254133224487,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.7145671844482422,
"rewards/format_reward_step": 0.9375,
"rewards/step_l2_reward": 0.822378396987915,
"step": 126
},
{
"adv/mean_abs_final_conf": 0.6837416887283325,
"adv/mean_abs_reasoning": 0.5188707113265991,
"adv/mean_abs_step_conf": 0.7492235898971558,
"adv/ratio_final_to_reasoning": 1.3177496316571946,
"adv/ratio_step_to_reasoning": 1.4439504360953663,
"adv/std_final_conf": 0.876003623008728,
"adv/std_reasoning": 0.7755688428878784,
"adv/std_step_conf": 0.9350953698158264,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7494789915966387,
"calib/avg_num_step_conf": 7.17578125,
"calib/ece": 0.25270491803278683,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.4385245901639344,
"calib/gap": 0.3674971428571428,
"calib/mean_conf": 0.5354098360655737,
"calib/mu_c": 0.7146399999999999,
"calib/mu_w": 0.34714285714285714,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.13790983606557375,
"calib/std_conf": 0.44824365362825375,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5143576158940397,
"calib/step_q_c_n": 755.0,
"calib/step_q_gap": 0.19518016672583266,
"calib/step_q_w": 0.31917744916820706,
"calib/step_q_w_n": 1082.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2484.0,
"completions/max_terminated_length": 2484.0,
"completions/mean_length": 497.125,
"completions/mean_terminated_length": 507.02789306640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 97.0,
"epoch": 0.13546666666666668,
"grad_norm": 0.03446248173713684,
"kl": 0.1027679443359375,
"learning_rate": 2.027777777777778e-06,
"loss": -0.0337,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.032688289880752563,
"mask/share_reasoning": 0.8081912994384766,
"mask/share_step_conf": 0.13958916068077087,
"num_tokens": 29270569.0,
"reward": 1.1078226566314697,
"reward_std": 0.2610389292240143,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.698003888130188,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8195732831954956,
"step": 127
},
{
"adv/mean_abs_final_conf": 0.7368889451026917,
"adv/mean_abs_reasoning": 0.5929111242294312,
"adv/mean_abs_step_conf": 0.763623833656311,
"adv/ratio_final_to_reasoning": 1.24283204512376,
"adv/ratio_step_to_reasoning": 1.2879229322080006,
"adv/std_final_conf": 0.9130525588989258,
"adv/std_reasoning": 0.8429921865463257,
"adv/std_step_conf": 0.9349132776260376,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7458779191062657,
"calib/avg_num_step_conf": 5.62109375,
"calib/ece": 0.30168032786885246,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.5860655737704918,
"calib/gap": 0.3515774951208024,
"calib/mean_conf": 0.6850409836065574,
"calib/mu_c": 0.8680341880341881,
"calib/mu_w": 0.5164566929133857,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.2536065573770492,
"calib/std_conf": 0.4225600141204308,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5703135888501742,
"calib/step_q_c_n": 574.0,
"calib/step_q_gap": 0.1935737044571107,
"calib/step_q_w": 0.37673988439306355,
"calib/step_q_w_n": 865.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2515.0,
"completions/max_terminated_length": 2515.0,
"completions/mean_length": 516.453125,
"completions/mean_terminated_length": 518.4784545898438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 83.0,
"epoch": 0.13653333333333334,
"grad_norm": 0.0671505406498909,
"kl": 0.097900390625,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.025,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.035544052720069885,
"mask/share_reasoning": 0.8391733169555664,
"mask/share_step_conf": 0.1213764026761055,
"num_tokens": 29509445.0,
"reward": 1.0835953950881958,
"reward_std": 0.3216124475002289,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.6603434085845947,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l2_reward": 0.817585825920105,
"step": 128
},
{
"adv/mean_abs_final_conf": 0.6601653099060059,
"adv/mean_abs_reasoning": 0.5416503548622131,
"adv/mean_abs_step_conf": 0.7622501850128174,
"adv/ratio_final_to_reasoning": 1.2188034291493097,
"adv/ratio_step_to_reasoning": 1.407273489568231,
"adv/std_final_conf": 0.8547391295433044,
"adv/std_reasoning": 0.7928169965744019,
"adv/std_step_conf": 0.9353224635124207,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7006404810143128,
"calib/avg_num_step_conf": 6.42578125,
"calib/ece": 0.2701200000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.568,
"calib/gap": 0.28760930658126915,
"calib/mean_conf": 0.6901200000000001,
"calib/mu_c": 0.8132167832167831,
"calib/mu_w": 0.525607476635514,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1941200000000001,
"calib/std_conf": 0.4056850818060728,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5494588500563697,
"calib/step_q_c_n": 887.0,
"calib/step_q_gap": 0.1163981640405386,
"calib/step_q_w": 0.43306068601583114,
"calib/step_q_w_n": 758.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2895.0,
"completions/max_terminated_length": 2895.0,
"completions/mean_length": 478.265625,
"completions/mean_terminated_length": 478.265625,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.1376,
"grad_norm": 0.03564068675041199,
"kl": 0.1029205322265625,
"learning_rate": 1.9722222222222224e-06,
"loss": 0.0749,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03495129942893982,
"mask/share_reasoning": 0.8219506740570068,
"mask/share_step_conf": 0.14309805631637573,
"num_tokens": 29734265.0,
"reward": 1.1189069747924805,
"reward_std": 0.2425338625907898,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7006582021713257,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8200829029083252,
"step": 129
},
{
"adv/mean_abs_final_conf": 0.5851052403450012,
"adv/mean_abs_reasoning": 0.38511407375335693,
"adv/mean_abs_step_conf": 0.7399400472640991,
"adv/ratio_final_to_reasoning": 1.5193037082299072,
"adv/ratio_step_to_reasoning": 1.9213529125346571,
"adv/std_final_conf": 0.8160495758056641,
"adv/std_reasoning": 0.6612229943275452,
"adv/std_step_conf": 0.934004545211792,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7913043478260868,
"calib/avg_num_step_conf": 5.41015625,
"calib/ece": 0.18928286852589649,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6095617529880478,
"calib/gap": 0.46168668046928907,
"calib/mean_conf": 0.7056972111553784,
"calib/mu_c": 0.8712422360248446,
"calib/mu_w": 0.40955555555555556,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.12677290836653396,
"calib/std_conf": 0.4146959380894869,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5991325898389095,
"calib/step_q_c_n": 807.0,
"calib/step_q_gap": 0.14627791855863265,
"calib/step_q_w": 0.45285467128027684,
"calib/step_q_w_n": 578.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2941.0,
"completions/max_terminated_length": 2941.0,
"completions/mean_length": 437.25390625,
"completions/mean_terminated_length": 438.9686584472656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 74.0,
"epoch": 0.13866666666666666,
"grad_norm": 0.03557712584733963,
"kl": 0.1061553955078125,
"learning_rate": 1.944444444444445e-06,
"loss": 0.0019,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03761007636785507,
"mask/share_reasoning": 0.8260776996612549,
"mask/share_step_conf": 0.13240596652030945,
"num_tokens": 29951490.0,
"reward": 1.198660135269165,
"reward_std": 0.19062399864196777,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.7905261516571045,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8571335077285767,
"step": 130
},
{
"adv/mean_abs_final_conf": 0.620849609375,
"adv/mean_abs_reasoning": 0.4095512628555298,
"adv/mean_abs_step_conf": 0.7489230632781982,
"adv/ratio_final_to_reasoning": 1.5159264924401081,
"adv/ratio_step_to_reasoning": 1.8286430325143026,
"adv/std_final_conf": 0.8143406510353088,
"adv/std_reasoning": 0.7012926340103149,
"adv/std_step_conf": 0.9351503849029541,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.8083251392985906,
"calib/avg_num_step_conf": 6.41015625,
"calib/ece": 0.23830645161290326,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.4838709677419355,
"calib/gap": 0.44567944936086534,
"calib/mean_conf": 0.5695161290322581,
"calib/mu_c": 0.8121238938053098,
"calib/mu_w": 0.36644444444444446,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.17608870967741935,
"calib/std_conf": 0.4515865487701963,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5597693574958814,
"calib/step_q_c_n": 607.0,
"calib/step_q_gap": 0.2432606534339859,
"calib/step_q_w": 0.3165087040618955,
"calib/step_q_w_n": 1034.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2860.0,
"completions/max_terminated_length": 2860.0,
"completions/mean_length": 485.3984375,
"completions/mean_terminated_length": 487.302001953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.13973333333333332,
"grad_norm": 0.04578558728098869,
"kl": 0.099395751953125,
"learning_rate": 1.916666666666667e-06,
"loss": 0.0238,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.034842804074287415,
"mask/share_reasoning": 0.8241904377937317,
"mask/share_step_conf": 0.1370604932308197,
"num_tokens": 30181960.0,
"reward": 1.127051591873169,
"reward_std": 0.20851066708564758,
"rewards/accuracy_reward_step": 0.44140625,
"rewards/final_brier_reward_step": 0.7286202907562256,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8294886350631714,
"step": 131
},
{
"adv/mean_abs_final_conf": 0.5589631795883179,
"adv/mean_abs_reasoning": 0.438191294670105,
"adv/mean_abs_step_conf": 0.7336174249649048,
"adv/ratio_final_to_reasoning": 1.275614523581845,
"adv/ratio_step_to_reasoning": 1.67419443035082,
"adv/std_final_conf": 0.8089057803153992,
"adv/std_reasoning": 0.7013886570930481,
"adv/std_step_conf": 0.9349938631057739,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.8533506831489914,
"calib/avg_num_step_conf": 6.2578125,
"calib/ece": 0.183784860557769,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6055776892430279,
"calib/gap": 0.5183786597267406,
"calib/mean_conf": 0.6826693227091635,
"calib/mu_c": 0.9015862068965519,
"calib/mu_w": 0.3832075471698113,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14438247011952196,
"calib/std_conf": 0.4299805960311306,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.616969696969697,
"calib/step_q_c_n": 792.0,
"calib/step_q_gap": 0.270364758698092,
"calib/step_q_w": 0.34660493827160493,
"calib/step_q_w_n": 810.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3071.0,
"completions/max_terminated_length": 3071.0,
"completions/mean_length": 482.35546875,
"completions/mean_terminated_length": 484.2471008300781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 84.0,
"epoch": 0.1408,
"grad_norm": 0.047956615686416626,
"kl": 0.098297119140625,
"learning_rate": 1.888888888888889e-06,
"loss": -0.0322,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03540698066353798,
"mask/share_reasoning": 0.8221828937530518,
"mask/share_step_conf": 0.1385037899017334,
"num_tokens": 30411035.0,
"reward": 1.1885790824890137,
"reward_std": 0.2194693386554718,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7971832156181335,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.846545934677124,
"step": 132
},
{
"adv/mean_abs_final_conf": 0.6864287853240967,
"adv/mean_abs_reasoning": 0.5413827896118164,
"adv/mean_abs_step_conf": 0.7185360193252563,
"adv/ratio_final_to_reasoning": 1.2679176333187125,
"adv/ratio_step_to_reasoning": 1.3272236079770152,
"adv/std_final_conf": 0.8655786514282227,
"adv/std_reasoning": 0.8099425435066223,
"adv/std_step_conf": 0.9355461001396179,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7693362193362193,
"calib/avg_num_step_conf": 6.8046875,
"calib/ece": 0.26643032786885257,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.4344262295081967,
"calib/gap": 0.45164862914862913,
"calib/mean_conf": 0.5078319672131147,
"calib/mu_c": 0.7928888888888889,
"calib/mu_w": 0.34124025974025973,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.20270491803278695,
"calib/std_conf": 0.46393697508061343,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.49898576512455517,
"calib/step_q_c_n": 562.0,
"calib/step_q_gap": 0.1589264430906569,
"calib/step_q_w": 0.34005932203389827,
"calib/step_q_w_n": 1180.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2861.0,
"completions/max_terminated_length": 2861.0,
"completions/mean_length": 527.63671875,
"completions/mean_terminated_length": 544.6572265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 40.0,
"epoch": 0.14186666666666667,
"grad_norm": 0.040555987507104874,
"kl": 0.095001220703125,
"learning_rate": 1.8611111111111113e-06,
"loss": -0.1968,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.03197144716978073,
"mask/share_reasoning": 0.8133314847946167,
"mask/share_step_conf": 0.12344710528850555,
"num_tokens": 30652454.0,
"reward": 1.107369065284729,
"reward_std": 0.289519727230072,
"rewards/accuracy_reward_step": 0.3515625,
"rewards/final_brier_reward_step": 0.7042034268379211,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.8335855603218079,
"step": 133
},
{
"adv/mean_abs_final_conf": 0.6914292573928833,
"adv/mean_abs_reasoning": 0.608074963092804,
"adv/mean_abs_step_conf": 0.7426619529724121,
"adv/ratio_final_to_reasoning": 1.1370789776907126,
"adv/ratio_step_to_reasoning": 1.2213328915814408,
"adv/std_final_conf": 0.8574259281158447,
"adv/std_reasoning": 0.8431612849235535,
"adv/std_step_conf": 0.9358600974082947,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.7793327678823863,
"calib/avg_num_step_conf": 5.9609375,
"calib/ece": 0.2264016736401674,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.4811715481171548,
"calib/gap": 0.4349752615210632,
"calib/mean_conf": 0.5857322175732217,
"calib/mu_c": 0.782290076335878,
"calib/mu_w": 0.3473148148148148,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.13200836820083683,
"calib/std_conf": 0.4550141239332735,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.48848167539267023,
"calib/step_q_c_n": 764.0,
"calib/step_q_gap": 0.1779829877286282,
"calib/step_q_w": 0.31049868766404204,
"calib/step_q_w_n": 762.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 3040.0,
"completions/max_terminated_length": 3040.0,
"completions/mean_length": 554.38671875,
"completions/mean_terminated_length": 558.751953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.14293333333333333,
"grad_norm": 0.032993435859680176,
"kl": 0.0858001708984375,
"learning_rate": 1.8333333333333333e-06,
"loss": -0.0314,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.03068811446428299,
"mask/share_reasoning": 0.8482072949409485,
"mask/share_step_conf": 0.11329209804534912,
"num_tokens": 30903329.0,
"reward": 1.092668056488037,
"reward_std": 0.33639949560165405,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.6982929706573486,
"rewards/format_reward_step": 0.921875,
"rewards/step_l2_reward": 0.7996953725814819,
"step": 134
},
{
"adv/mean_abs_final_conf": 0.6794062852859497,
"adv/mean_abs_reasoning": 0.5256780385971069,
"adv/mean_abs_step_conf": 0.7526997923851013,
"adv/ratio_final_to_reasoning": 1.2924380236600754,
"adv/ratio_step_to_reasoning": 1.4318646340902015,
"adv/std_final_conf": 0.8573894500732422,
"adv/std_reasoning": 0.7755757570266724,
"adv/std_step_conf": 0.9348970651626587,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.6805479452054795,
"calib/avg_num_step_conf": 6.73828125,
"calib/ece": 0.28081300813008125,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.4146341463414634,
"calib/gap": 0.30192739726027396,
"calib/mean_conf": 0.5482926829268292,
"calib/mu_c": 0.6710273972602739,
"calib/mu_w": 0.3691,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.11780487804878045,
"calib/std_conf": 0.4409085359810263,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4391374527112232,
"calib/step_q_c_n": 793.0,
"calib/step_q_gap": 0.14010311794727465,
"calib/step_q_w": 0.29903433476394853,
"calib/step_q_w_n": 932.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2771.0,
"completions/max_terminated_length": 2771.0,
"completions/mean_length": 514.8828125,
"completions/mean_terminated_length": 525.1394653320312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.144,
"grad_norm": 0.04568664729595184,
"kl": 0.090484619140625,
"learning_rate": 1.8055555555555557e-06,
"loss": -0.0877,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.0343073233962059,
"mask/share_reasoning": 0.8224637508392334,
"mask/share_step_conf": 0.12369771301746368,
"num_tokens": 31141019.0,
"reward": 1.1282449960708618,
"reward_std": 0.2306542545557022,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.6699097752571106,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.8551160097122192,
"step": 135
},
{
"adv/mean_abs_final_conf": 0.5642716884613037,
"adv/mean_abs_reasoning": 0.4450831115245819,
"adv/mean_abs_step_conf": 0.7536429762840271,
"adv/ratio_final_to_reasoning": 1.2677894843694582,
"adv/ratio_step_to_reasoning": 1.6932634754495812,
"adv/std_final_conf": 0.7953715920448303,
"adv/std_reasoning": 0.7014799118041992,
"adv/std_step_conf": 0.9343529939651489,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.8631752305665348,
"calib/avg_num_step_conf": 6.421875,
"calib/ece": 0.1445564516129032,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.31048387096774194,
"calib/gap": 0.6184361001317522,
"calib/mean_conf": 0.3692338709677419,
"calib/mu_c": 0.7133636363636363,
"calib/mu_w": 0.09492753623188407,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.03512096774193546,
"calib/std_conf": 0.44941518645958045,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.45343076923076925,
"calib/step_q_c_n": 650.0,
"calib/step_q_gap": 0.20295793220863645,
"calib/step_q_w": 0.2504728370221328,
"calib/step_q_w_n": 994.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1826.0,
"completions/max_terminated_length": 1826.0,
"completions/mean_length": 483.77734375,
"completions/mean_terminated_length": 483.77734375,
"completions/min_length": 5.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.14506666666666668,
"grad_norm": 0.04788472130894661,
"kl": 0.1029815673828125,
"learning_rate": 1.777777777777778e-06,
"loss": -0.0129,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03419430926442146,
"mask/share_reasoning": 0.8263726234436035,
"mask/share_step_conf": 0.13943305611610413,
"num_tokens": 31373354.0,
"reward": 1.2139787673950195,
"reward_std": 0.17367833852767944,
"rewards/accuracy_reward_step": 0.4296875,
"rewards/final_brier_reward_step": 0.8243738412857056,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8825975060462952,
"step": 136
},
{
"adv/mean_abs_final_conf": 0.6119123697280884,
"adv/mean_abs_reasoning": 0.45151853561401367,
"adv/mean_abs_step_conf": 0.7595447897911072,
"adv/ratio_final_to_reasoning": 1.3552320037004846,
"adv/ratio_step_to_reasoning": 1.6822006847586288,
"adv/std_final_conf": 0.8421707153320312,
"adv/std_reasoning": 0.7206130623817444,
"adv/std_step_conf": 0.9349687695503235,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.788516129032258,
"calib/avg_num_step_conf": 6.859375,
"calib/ece": 0.20839357429718883,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.3855421686746988,
"calib/gap": 0.47698516129032253,
"calib/mean_conf": 0.4663052208835341,
"calib/mu_c": 0.7038399999999999,
"calib/mu_w": 0.2268548387096774,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.0863453815261045,
"calib/std_conf": 0.4581734330927749,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4103771131339402,
"calib/step_q_c_n": 769.0,
"calib/step_q_gap": 0.12516941303262308,
"calib/step_q_w": 0.28520770010131713,
"calib/step_q_w_n": 987.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2288.0,
"completions/max_terminated_length": 2288.0,
"completions/mean_length": 481.43359375,
"completions/mean_terminated_length": 485.2243957519531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.14613333333333334,
"grad_norm": 0.04561835899949074,
"kl": 0.1042022705078125,
"learning_rate": 1.75e-06,
"loss": -0.077,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03354161977767944,
"mask/share_reasoning": 0.8171862959861755,
"mask/share_step_conf": 0.14145955443382263,
"num_tokens": 31603585.0,
"reward": 1.181359052658081,
"reward_std": 0.20482224225997925,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.7560410499572754,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8763262033462524,
"step": 137
},
{
"adv/mean_abs_final_conf": 0.611071765422821,
"adv/mean_abs_reasoning": 0.5744329690933228,
"adv/mean_abs_step_conf": 0.7322912216186523,
"adv/ratio_final_to_reasoning": 1.063782544353832,
"adv/ratio_step_to_reasoning": 1.2748070898063022,
"adv/std_final_conf": 0.8569561839103699,
"adv/std_reasoning": 0.8266201615333557,
"adv/std_step_conf": 0.9349929690361023,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7623496107572542,
"calib/avg_num_step_conf": 6.609375,
"calib/ece": 0.2608906882591093,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.39271255060728744,
"calib/gap": 0.42322717622080674,
"calib/mean_conf": 0.48234817813765185,
"calib/mu_c": 0.6365605095541401,
"calib/mu_w": 0.21333333333333337,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.05380566801619439,
"calib/std_conf": 0.4529741484641731,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4167561761546724,
"calib/step_q_c_n": 931.0,
"calib/step_q_gap": 0.1608954665620312,
"calib/step_q_w": 0.25586070959264123,
"calib/step_q_w_n": 761.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2432.0,
"completions/max_terminated_length": 2432.0,
"completions/mean_length": 495.27734375,
"completions/mean_terminated_length": 499.1771545410156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.1472,
"grad_norm": 0.04322437196969986,
"kl": 0.100189208984375,
"learning_rate": 1.7222222222222224e-06,
"loss": 0.0086,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03365977108478546,
"mask/share_reasoning": 0.8261804580688477,
"mask/share_step_conf": 0.1323472559452057,
"num_tokens": 31834712.0,
"reward": 1.139789342880249,
"reward_std": 0.22284376621246338,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.7059851884841919,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8386456370353699,
"step": 138
},
{
"adv/mean_abs_final_conf": 0.6286588907241821,
"adv/mean_abs_reasoning": 0.40157026052474976,
"adv/mean_abs_step_conf": 0.7390480041503906,
"adv/ratio_final_to_reasoning": 1.565501613348273,
"adv/ratio_step_to_reasoning": 1.8403952603079812,
"adv/std_final_conf": 0.8444784879684448,
"adv/std_reasoning": 0.681614339351654,
"adv/std_step_conf": 0.9344739317893982,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7652531517562191,
"calib/avg_num_step_conf": 6.0234375,
"calib/ece": 0.2688582677165354,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.2755905511811024,
"calib/gap": 0.3859131665880132,
"calib/mean_conf": 0.4299606299212599,
"calib/mu_c": 0.5682208588957055,
"calib/mu_w": 0.1823076923076923,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.028543307086614178,
"calib/std_conf": 0.42258158419456776,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.39593394077448746,
"calib/step_q_c_n": 878.0,
"calib/step_q_gap": 0.1422441817383429,
"calib/step_q_w": 0.25368975903614455,
"calib/step_q_w_n": 664.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2835.0,
"completions/max_terminated_length": 2835.0,
"completions/mean_length": 468.63671875,
"completions/mean_terminated_length": 468.63671875,
"completions/min_length": 143.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.14826666666666666,
"grad_norm": 0.06251625716686249,
"kl": 0.10687255859375,
"learning_rate": 1.6944444444444446e-06,
"loss": 0.1717,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.037038594484329224,
"mask/share_reasoning": 0.8287454843521118,
"mask/share_step_conf": 0.134215846657753,
"num_tokens": 32057779.0,
"reward": 1.1774184703826904,
"reward_std": 0.18099024891853333,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.7145543098449707,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8768551349639893,
"step": 139
},
{
"adv/mean_abs_final_conf": 0.5951710939407349,
"adv/mean_abs_reasoning": 0.4744147062301636,
"adv/mean_abs_step_conf": 0.7358417510986328,
"adv/ratio_final_to_reasoning": 1.254537614717167,
"adv/ratio_step_to_reasoning": 1.5510517305541478,
"adv/std_final_conf": 0.816806972026825,
"adv/std_reasoning": 0.7207441329956055,
"adv/std_step_conf": 0.93478924036026,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7961909224795822,
"calib/avg_num_step_conf": 5.77734375,
"calib/ece": 0.20410358565737058,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.4940239043824701,
"calib/gap": 0.4661393760878296,
"calib/mean_conf": 0.5888844621513944,
"calib/mu_c": 0.769025974025974,
"calib/mu_w": 0.3028865979381443,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.08972111553784864,
"calib/std_conf": 0.4461374185301638,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.47370283018867926,
"calib/step_q_c_n": 848.0,
"calib/step_q_gap": 0.18343341655951922,
"calib/step_q_w": 0.29026941362916003,
"calib/step_q_w_n": 631.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2434.0,
"completions/max_terminated_length": 2434.0,
"completions/mean_length": 485.96875,
"completions/mean_terminated_length": 485.96875,
"completions/min_length": 152.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.14933333333333335,
"grad_norm": 0.06402932852506638,
"kl": 0.098541259765625,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0951,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03483884036540985,
"mask/share_reasoning": 0.8365387916564941,
"mask/share_step_conf": 0.12862235307693481,
"num_tokens": 32287203.0,
"reward": 1.1847764253616333,
"reward_std": 0.2182726263999939,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.7572652101516724,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8654833436012268,
"step": 140
},
{
"adv/mean_abs_final_conf": 0.5650936961174011,
"adv/mean_abs_reasoning": 0.4365852475166321,
"adv/mean_abs_step_conf": 0.7492316961288452,
"adv/ratio_final_to_reasoning": 1.2943490402658955,
"adv/ratio_step_to_reasoning": 1.7161177579650182,
"adv/std_final_conf": 0.8094077706336975,
"adv/std_reasoning": 0.7205508351325989,
"adv/std_step_conf": 0.9348444938659668,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.8673163418290855,
"calib/avg_num_step_conf": 6.29296875,
"calib/ece": 0.12342741935483872,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5362903225806451,
"calib/gap": 0.6508752766473904,
"calib/mean_conf": 0.6141532258064516,
"calib/mu_c": 0.8424844720496893,
"calib/mu_w": 0.19160919540229887,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.04419354838709679,
"calib/std_conf": 0.4500027875246072,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4526805251641138,
"calib/step_q_c_n": 914.0,
"calib/step_q_gap": 0.19392873176382686,
"calib/step_q_w": 0.2587517934002869,
"calib/step_q_w_n": 697.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2979.0,
"completions/max_terminated_length": 2979.0,
"completions/mean_length": 513.63671875,
"completions/mean_terminated_length": 515.6510009765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 88.0,
"epoch": 0.1504,
"grad_norm": 0.04482351988554001,
"kl": 0.0861968994140625,
"learning_rate": 1.638888888888889e-06,
"loss": 0.0588,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.032881513237953186,
"mask/share_reasoning": 0.8362753987312317,
"mask/share_step_conf": 0.12693683803081512,
"num_tokens": 32525790.0,
"reward": 1.232285737991333,
"reward_std": 0.1995995044708252,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.8379597663879395,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8713870644569397,
"step": 141
},
{
"adv/mean_abs_final_conf": 0.6428453922271729,
"adv/mean_abs_reasoning": 0.5539547801017761,
"adv/mean_abs_step_conf": 0.7590426206588745,
"adv/ratio_final_to_reasoning": 1.160465466349194,
"adv/ratio_step_to_reasoning": 1.3702248774158394,
"adv/std_final_conf": 0.8463671803474426,
"adv/std_reasoning": 0.8099223971366882,
"adv/std_step_conf": 0.9345756769180298,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.804326923076923,
"calib/avg_num_step_conf": 6.56640625,
"calib/ece": 0.18180000000000002,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.38,
"calib/gap": 0.4997435897435897,
"calib/mean_conf": 0.4782,
"calib/mu_c": 0.7180769230769231,
"calib/mu_w": 0.21833333333333335,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.07000000000000002,
"calib/std_conf": 0.4522251209298307,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4578113663845224,
"calib/step_q_c_n": 827.0,
"calib/step_q_gap": 0.15025867317609148,
"calib/step_q_w": 0.3075526932084309,
"calib/step_q_w_n": 854.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2665.0,
"completions/max_terminated_length": 2665.0,
"completions/mean_length": 510.1796875,
"completions/mean_terminated_length": 512.180419921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.15146666666666667,
"grad_norm": 0.03714209049940109,
"kl": 0.0925140380859375,
"learning_rate": 1.6111111111111113e-06,
"loss": -0.0994,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03342527151107788,
"mask/share_reasoning": 0.8303429484367371,
"mask/share_step_conf": 0.13232550024986267,
"num_tokens": 32761556.0,
"reward": 1.1912291049957275,
"reward_std": 0.20463573932647705,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.7750167846679688,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8737108707427979,
"step": 142
},
{
"adv/mean_abs_final_conf": 0.6427560448646545,
"adv/mean_abs_reasoning": 0.5893682241439819,
"adv/mean_abs_step_conf": 0.7372866868972778,
"adv/ratio_final_to_reasoning": 1.090584830558544,
"adv/ratio_step_to_reasoning": 1.250978007794936,
"adv/std_final_conf": 0.8869020938873291,
"adv/std_reasoning": 0.8429052829742432,
"adv/std_step_conf": 0.9352031946182251,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.8430587739625026,
"calib/avg_num_step_conf": 6.42578125,
"calib/ece": 0.16661157024793383,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.4256198347107438,
"calib/gap": 0.5487072537040938,
"calib/mean_conf": 0.5290082644628099,
"calib/mu_c": 0.7580141843971631,
"calib/mu_w": 0.20930693069306933,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.056487603305785075,
"calib/std_conf": 0.45419150757310883,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.42411154345006485,
"calib/step_q_c_n": 771.0,
"calib/step_q_gap": 0.16634266473152937,
"calib/step_q_w": 0.2577688787185355,
"calib/step_q_w_n": 874.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2473.0,
"completions/max_terminated_length": 2473.0,
"completions/mean_length": 519.203125,
"completions/mean_terminated_length": 523.2913208007812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 108.0,
"epoch": 0.15253333333333333,
"grad_norm": 0.047295790165662766,
"kl": 0.09334564208984375,
"learning_rate": 1.5833333333333333e-06,
"loss": -0.0946,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.03189526125788689,
"mask/share_reasoning": 0.8289496898651123,
"mask/share_step_conf": 0.13134250044822693,
"num_tokens": 33001808.0,
"reward": 1.1662962436676025,
"reward_std": 0.2495536059141159,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.769977331161499,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": 0.8422641754150391,
"step": 143
},
{
"adv/mean_abs_final_conf": 0.537711501121521,
"adv/mean_abs_reasoning": 0.48254498839378357,
"adv/mean_abs_step_conf": 0.7453676462173462,
"adv/ratio_final_to_reasoning": 1.1143240818050284,
"adv/ratio_step_to_reasoning": 1.5446593875078953,
"adv/std_final_conf": 0.7811760902404785,
"adv/std_reasoning": 0.7392632365226746,
"adv/std_step_conf": 0.9346184730529785,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7303717575253156,
"calib/avg_num_step_conf": 5.83984375,
"calib/ece": 0.26418326693227084,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5139442231075697,
"calib/gap": 0.41801359411846306,
"calib/mean_conf": 0.5696812749003984,
"calib/mu_c": 0.7179012345679012,
"calib/mu_w": 0.29988764044943816,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.09422310756972105,
"calib/std_conf": 0.46778436182848504,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.44524625267665957,
"calib/step_q_c_n": 934.0,
"calib/step_q_gap": 0.1155849336035758,
"calib/step_q_w": 0.32966131907308377,
"calib/step_q_w_n": 561.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1510.0,
"completions/max_terminated_length": 1510.0,
"completions/mean_length": 475.97265625,
"completions/mean_terminated_length": 477.8392333984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 55.0,
"epoch": 0.1536,
"grad_norm": 0.07732080668210983,
"kl": 0.141204833984375,
"learning_rate": 1.5555555555555558e-06,
"loss": -0.0174,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.035442303866147995,
"mask/share_reasoning": 0.8312504291534424,
"mask/share_step_conf": 0.12940102815628052,
"num_tokens": 33227785.0,
"reward": 1.1550257205963135,
"reward_std": 0.18985150754451752,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.7156910300254822,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8488442301750183,
"step": 144
},
{
"adv/mean_abs_final_conf": 0.6415660977363586,
"adv/mean_abs_reasoning": 0.5276660323143005,
"adv/mean_abs_step_conf": 0.7385779619216919,
"adv/ratio_final_to_reasoning": 1.2158563531605429,
"adv/ratio_step_to_reasoning": 1.3997072327781812,
"adv/std_final_conf": 0.8449078798294067,
"adv/std_reasoning": 0.7753786444664001,
"adv/std_step_conf": 0.9349455237388611,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7155692729766804,
"calib/avg_num_step_conf": 6.47265625,
"calib/ece": 0.24250000000000005,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.47619047619047616,
"calib/gap": 0.30662962962962975,
"calib/mean_conf": 0.623452380952381,
"calib/mu_c": 0.732962962962963,
"calib/mu_w": 0.4263333333333333,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.11154761904761909,
"calib/std_conf": 0.4225334420341048,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4739775051124744,
"calib/step_q_c_n": 978.0,
"calib/step_q_gap": 0.15094363176932268,
"calib/step_q_w": 0.3230338733431517,
"calib/step_q_w_n": 679.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3004.0,
"completions/max_terminated_length": 3004.0,
"completions/mean_length": 456.296875,
"completions/mean_terminated_length": 456.296875,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.15466666666666667,
"grad_norm": 0.03158734366297722,
"kl": 0.0939178466796875,
"learning_rate": 1.527777777777778e-06,
"loss": 0.0975,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.037699561566114426,
"mask/share_reasoning": 0.8159480094909668,
"mask/share_step_conf": 0.1463523805141449,
"num_tokens": 33447301.0,
"reward": 1.1692979335784912,
"reward_std": 0.21003535389900208,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.7208542823791504,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.862869381904602,
"step": 145
},
{
"adv/mean_abs_final_conf": 0.6607988476753235,
"adv/mean_abs_reasoning": 0.46970364451408386,
"adv/mean_abs_step_conf": 0.729251503944397,
"adv/ratio_final_to_reasoning": 1.406842070299307,
"adv/ratio_step_to_reasoning": 1.5525779126087464,
"adv/std_final_conf": 0.8592219352722168,
"adv/std_reasoning": 0.7394216656684875,
"adv/std_step_conf": 0.9346105456352234,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7792182662538699,
"calib/avg_num_step_conf": 6.2734375,
"calib/ece": 0.24348000000000009,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.512,
"calib/gap": 0.46127708978328164,
"calib/mean_conf": 0.6119600000000001,
"calib/mu_c": 0.8628947368421052,
"calib/mu_w": 0.4016176470588235,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.19972000000000006,
"calib/std_conf": 0.44000972534706545,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4796870342771982,
"calib/step_q_c_n": 671.0,
"calib/step_q_gap": 0.15754799684404308,
"calib/step_q_w": 0.32213903743315514,
"calib/step_q_w_n": 935.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2497.0,
"completions/max_terminated_length": 2497.0,
"completions/mean_length": 502.37890625,
"completions/mean_terminated_length": 504.34906005859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.15573333333333333,
"grad_norm": 0.04502442106604576,
"kl": 0.08422088623046875,
"learning_rate": 1.5e-06,
"loss": -0.1369,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.033101823180913925,
"mask/share_reasoning": 0.8348150253295898,
"mask/share_step_conf": 0.12817689776420593,
"num_tokens": 33683126.0,
"reward": 1.1563743352890015,
"reward_std": 0.23182733356952667,
"rewards/accuracy_reward_step": 0.4453125,
"rewards/final_brier_reward_step": 0.7413148283958435,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8585601449012756,
"step": 146
},
{
"adv/mean_abs_final_conf": 0.6285818219184875,
"adv/mean_abs_reasoning": 0.5205052495002747,
"adv/mean_abs_step_conf": 0.729662299156189,
"adv/ratio_final_to_reasoning": 1.2076378144542725,
"adv/ratio_step_to_reasoning": 1.4018346594135627,
"adv/std_final_conf": 0.8171048164367676,
"adv/std_reasoning": 0.7753753662109375,
"adv/std_step_conf": 0.9347905516624451,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7239484396200814,
"calib/avg_num_step_conf": 7.15234375,
"calib/ece": 0.33013661202185796,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.6311475409836066,
"calib/gap": 0.3178037087290819,
"calib/mean_conf": 0.6843169398907105,
"calib/mu_c": 0.8588484848484849,
"calib/mu_w": 0.5410447761194029,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2818169398907104,
"calib/std_conf": 0.43229103073089625,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5687556904400608,
"calib/step_q_c_n": 659.0,
"calib/step_q_gap": 0.2364263389042246,
"calib/step_q_w": 0.3323293515358362,
"calib/step_q_w_n": 1172.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1881.0,
"completions/max_terminated_length": 1881.0,
"completions/mean_length": 505.43359375,
"completions/mean_terminated_length": 513.4563598632812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.1568,
"grad_norm": 0.03567939251661301,
"kl": 0.111602783203125,
"learning_rate": 1.4722222222222225e-06,
"loss": -0.0141,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.03253734111785889,
"mask/share_reasoning": 0.8155205845832825,
"mask/share_step_conf": 0.13631707429885864,
"num_tokens": 33916197.0,
"reward": 1.082091212272644,
"reward_std": 0.2596330940723419,
"rewards/accuracy_reward_step": 0.4296875,
"rewards/final_brier_reward_step": 0.6370561122894287,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8337092399597168,
"step": 147
},
{
"adv/mean_abs_final_conf": 0.5409537553787231,
"adv/mean_abs_reasoning": 0.5042853355407715,
"adv/mean_abs_step_conf": 0.7518453598022461,
"adv/ratio_final_to_reasoning": 1.072713635026944,
"adv/ratio_step_to_reasoning": 1.4909125981147222,
"adv/std_final_conf": 0.776544988155365,
"adv/std_reasoning": 0.7576196789741516,
"adv/std_step_conf": 0.9346625804901123,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.8059624673370813,
"calib/avg_num_step_conf": 6.38671875,
"calib/ece": 0.15252032520325212,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.7073170731707317,
"calib/gap": 0.43957558001425295,
"calib/mean_conf": 0.7866666666666666,
"calib/mu_c": 0.9171098265895954,
"calib/mu_w": 0.47753424657534244,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.11796747967479684,
"calib/std_conf": 0.3661015600943583,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5194146100691016,
"calib/step_q_c_n": 1013.0,
"calib/step_q_gap": 0.19250142678935883,
"calib/step_q_w": 0.3269131832797428,
"calib/step_q_w_n": 622.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1737.0,
"completions/max_terminated_length": 1737.0,
"completions/mean_length": 467.7109375,
"completions/mean_terminated_length": 471.3937072753906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.15786666666666666,
"grad_norm": 0.039492230862379074,
"kl": 0.08855438232421875,
"learning_rate": 1.4444444444444445e-06,
"loss": -0.091,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03712426871061325,
"mask/share_reasoning": 0.8118203282356262,
"mask/share_step_conf": 0.14324289560317993,
"num_tokens": 34141043.0,
"reward": 1.1953483819961548,
"reward_std": 0.2314508557319641,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.8012219071388245,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8414207696914673,
"step": 148
},
{
"adv/mean_abs_final_conf": 0.6707220077514648,
"adv/mean_abs_reasoning": 0.5681421756744385,
"adv/mean_abs_step_conf": 0.7494305372238159,
"adv/ratio_final_to_reasoning": 1.180553101792266,
"adv/ratio_step_to_reasoning": 1.319089779480235,
"adv/std_final_conf": 0.8762696385383606,
"adv/std_reasoning": 0.7928650379180908,
"adv/std_step_conf": 0.9343019127845764,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.866640153774773,
"calib/avg_num_step_conf": 6.67578125,
"calib/ece": 0.2108064516129033,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.6733870967741935,
"calib/gap": 0.46435341684894277,
"calib/mean_conf": 0.7583064516129032,
"calib/mu_c": 0.9586524822695035,
"calib/mu_w": 0.49429906542056073,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2002822580645162,
"calib/std_conf": 0.3839937032711931,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5505997552019585,
"calib/step_q_c_n": 817.0,
"calib/step_q_gap": 0.20829482246653247,
"calib/step_q_w": 0.342304932735426,
"calib/step_q_w_n": 892.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 3055.0,
"completions/max_terminated_length": 3055.0,
"completions/mean_length": 536.609375,
"completions/mean_terminated_length": 540.8346557617188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 87.0,
"epoch": 0.15893333333333334,
"grad_norm": 0.058882106095552444,
"kl": 0.0788726806640625,
"learning_rate": 1.4166666666666667e-06,
"loss": -0.033,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.034434471279382706,
"mask/share_reasoning": 0.82708740234375,
"mask/share_step_conf": 0.1306656002998352,
"num_tokens": 34382871.0,
"reward": 1.1861321926116943,
"reward_std": 0.2548786401748657,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7701757550239563,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8659757375717163,
"step": 149
},
{
"adv/mean_abs_final_conf": 0.6370327472686768,
"adv/mean_abs_reasoning": 0.47151249647140503,
"adv/mean_abs_step_conf": 0.742438554763794,
"adv/ratio_final_to_reasoning": 1.3510410689768637,
"adv/ratio_step_to_reasoning": 1.5745893487868974,
"adv/std_final_conf": 0.85017329454422,
"adv/std_reasoning": 0.7206061482429504,
"adv/std_step_conf": 0.9352177381515503,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7956780538302277,
"calib/avg_num_step_conf": 6.5078125,
"calib/ece": 0.27388000000000007,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.736,
"calib/gap": 0.3759510869565218,
"calib/mean_conf": 0.7894,
"calib/mu_c": 0.9578260869565218,
"calib/mu_w": 0.581875,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2556400000000001,
"calib/std_conf": 0.3788256063150959,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5579088471849867,
"calib/step_q_c_n": 746.0,
"calib/step_q_gap": 0.1947131950110736,
"calib/step_q_w": 0.3631956521739131,
"calib/step_q_w_n": 920.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2666.0,
"completions/max_terminated_length": 2666.0,
"completions/mean_length": 435.05859375,
"completions/mean_terminated_length": 438.4842529296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.16,
"grad_norm": 0.0466759093105793,
"kl": 0.09934234619140625,
"learning_rate": 1.3888888888888892e-06,
"loss": -0.0764,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.04131307825446129,
"mask/share_reasoning": 0.7998567819595337,
"mask/share_step_conf": 0.15101763606071472,
"num_tokens": 34599206.0,
"reward": 1.1474685668945312,
"reward_std": 0.21869435906410217,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.721463680267334,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8468990325927734,
"step": 150
},
{
"adv/mean_abs_final_conf": 0.6580295562744141,
"adv/mean_abs_reasoning": 0.5155429244041443,
"adv/mean_abs_step_conf": 0.7393295764923096,
"adv/ratio_final_to_reasoning": 1.2763817038803382,
"adv/ratio_step_to_reasoning": 1.4340795722234265,
"adv/std_final_conf": 0.8595101833343506,
"adv/std_reasoning": 0.7755288481712341,
"adv/std_step_conf": 0.9352730512619019,
"calib/answer_extract_rate": 0.92578125,
"calib/auroc": 0.7508652963198418,
"calib/avg_num_step_conf": 6.53125,
"calib/ece": 0.28424369747899164,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.91015625,
"calib/frac_conf_gt_0.9": 0.6092436974789915,
"calib/gap": 0.37555484919121285,
"calib/mean_conf": 0.6671008403361344,
"calib/mu_c": 0.8580341880341881,
"calib/mu_w": 0.4824793388429752,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.22987394957983198,
"calib/std_conf": 0.43708497519991735,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.514,
"calib/step_q_c_n": 640.0,
"calib/step_q_gap": 0.16983333333333334,
"calib/step_q_w": 0.3441666666666667,
"calib/step_q_w_n": 1032.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2803.0,
"completions/max_terminated_length": 2803.0,
"completions/mean_length": 556.33203125,
"completions/mean_terminated_length": 562.9288940429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.16106666666666666,
"grad_norm": 0.032359592616558075,
"kl": 0.07588958740234375,
"learning_rate": 1.3611111111111112e-06,
"loss": -0.07,
"mask/has_final_conf_rate": 0.9296875,
"mask/share_final_conf": 0.03231126070022583,
"mask/share_reasoning": 0.835185170173645,
"mask/share_step_conf": 0.12078479677438736,
"num_tokens": 34848651.0,
"reward": 1.0490763187408447,
"reward_std": 0.29749518632888794,
"rewards/accuracy_reward_step": 0.45703125,
"rewards/final_brier_reward_step": 0.6578387022018433,
"rewards/format_reward_step": 0.91015625,
"rewards/step_l2_reward": 0.7779176235198975,
"step": 151
},
{
"adv/mean_abs_final_conf": 0.6492472887039185,
"adv/mean_abs_reasoning": 0.5543559193611145,
"adv/mean_abs_step_conf": 0.7531231641769409,
"adv/ratio_final_to_reasoning": 1.1711740887554056,
"adv/ratio_step_to_reasoning": 1.358555285284772,
"adv/std_final_conf": 0.8588571548461914,
"adv/std_reasoning": 0.7929433584213257,
"adv/std_step_conf": 0.9349693655967712,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7679598662207359,
"calib/avg_num_step_conf": 6.58984375,
"calib/ece": 0.2981999999999998,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.6122448979591837,
"calib/gap": 0.32719832775919744,
"calib/mean_conf": 0.7059632653061224,
"calib/mu_c": 0.859546153846154,
"calib/mu_w": 0.5323478260869565,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.23677551020408147,
"calib/std_conf": 0.4136108230405445,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.47160194174757286,
"calib/step_q_c_n": 824.0,
"calib/step_q_gap": 0.12535396955753814,
"calib/step_q_w": 0.3462479721900347,
"calib/step_q_w_n": 863.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2701.0,
"completions/max_terminated_length": 2701.0,
"completions/mean_length": 491.62890625,
"completions/mean_terminated_length": 491.62890625,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.16213333333333332,
"grad_norm": 0.04753780737519264,
"kl": 0.08827972412109375,
"learning_rate": 1.3333333333333334e-06,
"loss": -0.0528,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.035858429968357086,
"mask/share_reasoning": 0.8217682242393494,
"mask/share_step_conf": 0.14237335324287415,
"num_tokens": 35079900.0,
"reward": 1.095044732093811,
"reward_std": 0.28394240140914917,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.6688538193702698,
"rewards/format_reward_step": 0.9375,
"rewards/step_l2_reward": 0.8204070925712585,
"step": 152
},
{
"adv/mean_abs_final_conf": 0.6669121980667114,
"adv/mean_abs_reasoning": 0.4369834363460541,
"adv/mean_abs_step_conf": 0.7152504920959473,
"adv/ratio_final_to_reasoning": 1.5261727163923282,
"adv/ratio_step_to_reasoning": 1.6367908543094734,
"adv/std_final_conf": 0.8692089319229126,
"adv/std_reasoning": 0.7208417057991028,
"adv/std_step_conf": 0.9349717497825623,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.6541086541086542,
"calib/avg_num_step_conf": 6.44140625,
"calib/ece": 0.2792307692307693,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.7368421052631579,
"calib/gap": 0.2747229047229047,
"calib/mean_conf": 0.7925910931174088,
"calib/mu_c": 0.9027027027027027,
"calib/mu_w": 0.627979797979798,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.23631578947368426,
"calib/std_conf": 0.37290489882058686,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.4857142857142857,
"calib/step_q_c_n": 770.0,
"calib/step_q_gap": 0.1988883471477328,
"calib/step_q_w": 0.2868259385665529,
"calib/step_q_w_n": 879.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2450.0,
"completions/max_terminated_length": 2450.0,
"completions/mean_length": 494.52734375,
"completions/mean_terminated_length": 498.4212646484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 89.0,
"epoch": 0.1632,
"grad_norm": 0.03760277479887009,
"kl": 0.0871429443359375,
"learning_rate": 1.3055555555555556e-06,
"loss": 0.0606,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03430212289094925,
"mask/share_reasoning": 0.8316156268119812,
"mask/share_step_conf": 0.12626971304416656,
"num_tokens": 35313819.0,
"reward": 1.112259864807129,
"reward_std": 0.2520466148853302,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.6784656047821045,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.8270569443702698,
"step": 153
},
{
"adv/mean_abs_final_conf": 0.6259250640869141,
"adv/mean_abs_reasoning": 0.4594433605670929,
"adv/mean_abs_step_conf": 0.7331865429878235,
"adv/ratio_final_to_reasoning": 1.3623552276701356,
"adv/ratio_step_to_reasoning": 1.5958148618860184,
"adv/std_final_conf": 0.861115038394928,
"adv/std_reasoning": 0.739260733127594,
"adv/std_step_conf": 0.9349847435951233,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.732061790668348,
"calib/avg_num_step_conf": 6.046875,
"calib/ece": 0.32396825396825396,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6865079365079365,
"calib/gap": 0.3342345523329129,
"calib/mean_conf": 0.7538888888888889,
"calib/mu_c": 0.9263114754098359,
"calib/mu_w": 0.5920769230769231,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.29686507936507933,
"calib/std_conf": 0.3991160316273202,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.48846938775510207,
"calib/step_q_c_n": 686.0,
"calib/step_q_gap": 0.1685157914673991,
"calib/step_q_w": 0.319953596287703,
"calib/step_q_w_n": 862.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2309.0,
"completions/max_terminated_length": 2309.0,
"completions/mean_length": 468.23046875,
"completions/mean_terminated_length": 468.23046875,
"completions/min_length": 188.0,
"completions/min_terminated_length": 188.0,
"epoch": 0.16426666666666667,
"grad_norm": 0.05206868425011635,
"kl": 0.09896087646484375,
"learning_rate": 1.2777777777777779e-06,
"loss": 0.0221,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03563937172293663,
"mask/share_reasoning": 0.8281756043434143,
"mask/share_step_conf": 0.13618502020835876,
"num_tokens": 35538126.0,
"reward": 1.1311614513397217,
"reward_std": 0.24756911396980286,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.6666179895401001,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8700532913208008,
"step": 154
},
{
"adv/mean_abs_final_conf": 0.6815347671508789,
"adv/mean_abs_reasoning": 0.501927375793457,
"adv/mean_abs_step_conf": 0.7255311012268066,
"adv/ratio_final_to_reasoning": 1.3578354160768673,
"adv/ratio_step_to_reasoning": 1.445490196823539,
"adv/std_final_conf": 0.8606975078582764,
"adv/std_reasoning": 0.7576583027839661,
"adv/std_step_conf": 0.9355096817016602,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7602004294917681,
"calib/avg_num_step_conf": 6.38671875,
"calib/ece": 0.25439516129032247,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.5483870967741935,
"calib/gap": 0.3882976508101777,
"calib/mean_conf": 0.6551209677419356,
"calib/mu_c": 0.8539669421487605,
"calib/mu_w": 0.46566929133858276,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.21080645161290312,
"calib/std_conf": 0.42929234893097773,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4197167138810199,
"calib/step_q_c_n": 706.0,
"calib/step_q_gap": 0.1270611702857562,
"calib/step_q_w": 0.2926555435952637,
"calib/step_q_w_n": 929.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3024.0,
"completions/max_terminated_length": 3024.0,
"completions/mean_length": 448.1640625,
"completions/mean_terminated_length": 449.9216003417969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 47.0,
"epoch": 0.16533333333333333,
"grad_norm": 0.03472757712006569,
"kl": 0.105712890625,
"learning_rate": 1.25e-06,
"loss": 0.0211,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03683391213417053,
"mask/share_reasoning": 0.8154951333999634,
"mask/share_step_conf": 0.14376471936702728,
"num_tokens": 35760072.0,
"reward": 1.1191637516021729,
"reward_std": 0.2559158205986023,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.7044066190719604,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8309472799301147,
"step": 155
},
{
"adv/mean_abs_final_conf": 0.565130889415741,
"adv/mean_abs_reasoning": 0.47859495878219604,
"adv/mean_abs_step_conf": 0.7466103434562683,
"adv/ratio_final_to_reasoning": 1.180812457477068,
"adv/ratio_step_to_reasoning": 1.5600046129947713,
"adv/std_final_conf": 0.796569287776947,
"adv/std_reasoning": 0.7205866575241089,
"adv/std_step_conf": 0.9340900182723999,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7442551246899073,
"calib/avg_num_step_conf": 6.62890625,
"calib/ece": 0.274417670682731,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6104417670682731,
"calib/gap": 0.31649040344692514,
"calib/mean_conf": 0.7129718875502009,
"calib/mu_c": 0.8540579710144928,
"calib/mu_w": 0.5375675675675676,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.21658634538152616,
"calib/std_conf": 0.40130710532457237,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4248878923766816,
"calib/step_q_c_n": 892.0,
"calib/step_q_gap": 0.07880093585494247,
"calib/step_q_w": 0.34608695652173915,
"calib/step_q_w_n": 805.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2155.0,
"completions/max_terminated_length": 2155.0,
"completions/mean_length": 489.3046875,
"completions/mean_terminated_length": 491.22357177734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.1664,
"grad_norm": 0.03993004187941551,
"kl": 0.091796875,
"learning_rate": 1.2222222222222223e-06,
"loss": -0.117,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.035405173897743225,
"mask/share_reasoning": 0.8169474601745605,
"mask/share_step_conf": 0.14374110102653503,
"num_tokens": 35990094.0,
"reward": 1.1344704627990723,
"reward_std": 0.1839873492717743,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.7003722190856934,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8441497683525085,
"step": 156
},
{
"adv/mean_abs_final_conf": 0.6025352478027344,
"adv/mean_abs_reasoning": 0.5503590703010559,
"adv/mean_abs_step_conf": 0.7544006109237671,
"adv/ratio_final_to_reasoning": 1.0948038840772392,
"adv/ratio_step_to_reasoning": 1.3707425781337572,
"adv/std_final_conf": 0.8273239731788635,
"adv/std_reasoning": 0.7928828597068787,
"adv/std_step_conf": 0.934908926486969,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.8107236842105262,
"calib/avg_num_step_conf": 6.55078125,
"calib/ece": 0.20416666666666664,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6507936507936508,
"calib/gap": 0.4628789473684208,
"calib/mean_conf": 0.7078968253968254,
"calib/mu_c": 0.8915789473684209,
"calib/mu_w": 0.42870000000000014,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.1544444444444444,
"calib/std_conf": 0.4203399291307785,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.46174124513618675,
"calib/step_q_c_n": 1028.0,
"calib/step_q_gap": 0.12420657641507737,
"calib/step_q_w": 0.3375346687211094,
"calib/step_q_w_n": 649.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1280.0,
"completions/max_terminated_length": 1280.0,
"completions/mean_length": 470.54296875,
"completions/mean_terminated_length": 472.3882751464844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.16746666666666668,
"grad_norm": 0.03731679916381836,
"kl": 0.08966064453125,
"learning_rate": 1.1944444444444446e-06,
"loss": -0.0978,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.035209380090236664,
"mask/share_reasoning": 0.8131346702575684,
"mask/share_step_conf": 0.147749662399292,
"num_tokens": 36214281.0,
"reward": 1.1939563751220703,
"reward_std": 0.21433734893798828,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.778255820274353,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8632087111473083,
"step": 157
},
{
"adv/mean_abs_final_conf": 0.6070557832717896,
"adv/mean_abs_reasoning": 0.530916154384613,
"adv/mean_abs_step_conf": 0.7467933893203735,
"adv/ratio_final_to_reasoning": 1.1434117765269927,
"adv/ratio_step_to_reasoning": 1.4066126697274537,
"adv/std_final_conf": 0.8306695818901062,
"adv/std_reasoning": 0.7928876280784607,
"adv/std_step_conf": 0.9353790283203125,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6861399397388684,
"calib/avg_num_step_conf": 7.01953125,
"calib/ece": 0.2908064516129033,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.6774193548387096,
"calib/gap": 0.2564847673250755,
"calib/mean_conf": 0.7820967741935483,
"calib/mu_c": 0.8886206896551726,
"calib/mu_w": 0.6321359223300971,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2441129032258065,
"calib/std_conf": 0.3714917057068018,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.44382448537378116,
"calib/step_q_c_n": 923.0,
"calib/step_q_gap": 0.12091830688407862,
"calib/step_q_w": 0.32290617848970254,
"calib/step_q_w_n": 874.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2531.0,
"completions/max_terminated_length": 2531.0,
"completions/mean_length": 504.9140625,
"completions/mean_terminated_length": 506.8941345214844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 109.0,
"epoch": 0.16853333333333334,
"grad_norm": 0.039824292063713074,
"kl": 0.08563995361328125,
"learning_rate": 1.1666666666666668e-06,
"loss": 0.0819,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03828030824661255,
"mask/share_reasoning": 0.8063184022903442,
"mask/share_step_conf": 0.15149502456188202,
"num_tokens": 36448779.0,
"reward": 1.105135202407837,
"reward_std": 0.25119927525520325,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6788246035575867,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.816797137260437,
"step": 158
},
{
"adv/mean_abs_final_conf": 0.658309817314148,
"adv/mean_abs_reasoning": 0.5221865177154541,
"adv/mean_abs_step_conf": 0.7737743258476257,
"adv/ratio_final_to_reasoning": 1.2606794602706866,
"adv/ratio_step_to_reasoning": 1.4817968285218444,
"adv/std_final_conf": 0.85653156042099,
"adv/std_reasoning": 0.7753108143806458,
"adv/std_step_conf": 0.9341161251068115,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7589743589743589,
"calib/avg_num_step_conf": 6.56640625,
"calib/ece": 0.26162698412698404,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6468253968253969,
"calib/gap": 0.3909002849002847,
"calib/mean_conf": 0.7204365079365079,
"calib/mu_c": 0.9019259259259259,
"calib/mu_w": 0.5110256410256412,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.22317460317460308,
"calib/std_conf": 0.4106588395209512,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4664348925410872,
"calib/step_q_c_n": 791.0,
"calib/step_q_gap": 0.16261466782198614,
"calib/step_q_w": 0.3038202247191011,
"calib/step_q_w_n": 890.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1727.0,
"completions/max_terminated_length": 1727.0,
"completions/mean_length": 442.78125,
"completions/mean_terminated_length": 448.0316467285156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.1696,
"grad_norm": 0.05413948372006416,
"kl": 0.10001373291015625,
"learning_rate": 1.138888888888889e-06,
"loss": -0.1049,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.036744873970746994,
"mask/share_reasoning": 0.8111945390701294,
"mask/share_step_conf": 0.1403418779373169,
"num_tokens": 36666915.0,
"reward": 1.1794384717941284,
"reward_std": 0.21837085485458374,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.7313566207885742,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.883450984954834,
"step": 159
},
{
"adv/mean_abs_final_conf": 0.6163524985313416,
"adv/mean_abs_reasoning": 0.5648249983787537,
"adv/mean_abs_step_conf": 0.7450392246246338,
"adv/ratio_final_to_reasoning": 1.0912273718417032,
"adv/ratio_step_to_reasoning": 1.3190620577402883,
"adv/std_final_conf": 0.8117612600326538,
"adv/std_reasoning": 0.7930551767349243,
"adv/std_step_conf": 0.935079038143158,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.7678719008264463,
"calib/avg_num_step_conf": 6.953125,
"calib/ece": 0.2767768595041321,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.5826446280991735,
"calib/gap": 0.35518181818181827,
"calib/mean_conf": 0.6590082644628099,
"calib/mu_c": 0.8204545454545455,
"calib/mu_w": 0.4652727272727273,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.19516528925619825,
"calib/std_conf": 0.4414512223902891,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.42633136094674556,
"calib/step_q_c_n": 676.0,
"calib/step_q_gap": 0.18621360732355713,
"calib/step_q_w": 0.24011775362318843,
"calib/step_q_w_n": 1104.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 3013.0,
"completions/max_terminated_length": 3013.0,
"completions/mean_length": 495.39453125,
"completions/mean_terminated_length": 505.2629699707031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 72.0,
"epoch": 0.17066666666666666,
"grad_norm": 0.026633771136403084,
"kl": 0.09081268310546875,
"learning_rate": 1.111111111111111e-06,
"loss": 0.0281,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.03583589568734169,
"mask/share_reasoning": 0.8143898844718933,
"mask/share_step_conf": 0.1302429735660553,
"num_tokens": 36898576.0,
"reward": 1.1015212535858154,
"reward_std": 0.26079750061035156,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.6732444763183594,
"rewards/format_reward_step": 0.9375,
"rewards/step_l2_reward": 0.8261153697967529,
"step": 160
},
{
"adv/mean_abs_final_conf": 0.5825837254524231,
"adv/mean_abs_reasoning": 0.42827603220939636,
"adv/mean_abs_step_conf": 0.7349774241447449,
"adv/ratio_final_to_reasoning": 1.3602996236959188,
"adv/ratio_step_to_reasoning": 1.7161301797654496,
"adv/std_final_conf": 0.8111425042152405,
"adv/std_reasoning": 0.7205584645271301,
"adv/std_step_conf": 0.9347352385520935,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6690058479532164,
"calib/avg_num_step_conf": 6.078125,
"calib/ece": 0.28840597609561747,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6175298804780877,
"calib/gap": 0.24872061403508772,
"calib/mean_conf": 0.6920721115537849,
"calib/mu_c": 0.7713456140350877,
"calib/mu_w": 0.522625,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.14960159362549796,
"calib/std_conf": 0.42532849411659873,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.42097,
"calib/step_q_c_n": 1000.0,
"calib/step_q_gap": 0.1140275539568345,
"calib/step_q_w": 0.3069424460431655,
"calib/step_q_w_n": 556.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2579.0,
"completions/max_terminated_length": 2579.0,
"completions/mean_length": 450.99609375,
"completions/mean_terminated_length": 452.7647399902344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.17173333333333332,
"grad_norm": 0.04227954521775246,
"kl": 0.10567474365234375,
"learning_rate": 1.0833333333333335e-06,
"loss": 0.0908,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03879394754767418,
"mask/share_reasoning": 0.8145664930343628,
"mask/share_step_conf": 0.14273332059383392,
"num_tokens": 37117951.0,
"reward": 1.1423712968826294,
"reward_std": 0.18462207913398743,
"rewards/accuracy_reward_step": 0.671875,
"rewards/final_brier_reward_step": 0.6920831799507141,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8419811725616455,
"step": 161
},
{
"adv/mean_abs_final_conf": 0.5636827945709229,
"adv/mean_abs_reasoning": 0.44984203577041626,
"adv/mean_abs_step_conf": 0.7485389709472656,
"adv/ratio_final_to_reasoning": 1.2530682989764144,
"adv/ratio_step_to_reasoning": 1.6640040534791058,
"adv/std_final_conf": 0.7876395583152771,
"adv/std_reasoning": 0.7014114260673523,
"adv/std_step_conf": 0.9347339272499084,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7785826021900293,
"calib/avg_num_step_conf": 5.5,
"calib/ece": 0.19414062499999996,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.64453125,
"calib/gap": 0.45804121607835147,
"calib/mean_conf": 0.706171875,
"calib/mu_c": 0.8618343195266273,
"calib/mu_w": 0.4037931034482758,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.12007812499999997,
"calib/std_conf": 0.4198755788432859,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4280067567567568,
"calib/step_q_c_n": 888.0,
"calib/step_q_gap": 0.08075675675675675,
"calib/step_q_w": 0.34725000000000006,
"calib/step_q_w_n": 520.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1186.0,
"completions/max_terminated_length": 1186.0,
"completions/mean_length": 413.7421875,
"completions/mean_terminated_length": 415.3647155761719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.1728,
"grad_norm": 0.047483619302511215,
"kl": 0.104461669921875,
"learning_rate": 1.0555555555555557e-06,
"loss": 0.0835,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03956054151058197,
"mask/share_reasoning": 0.8172359466552734,
"mask/share_step_conf": 0.1392972469329834,
"num_tokens": 37328013.0,
"reward": 1.216137409210205,
"reward_std": 0.18878695368766785,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.7969167828559875,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8699263334274292,
"step": 162
},
{
"adv/mean_abs_final_conf": 0.5739138126373291,
"adv/mean_abs_reasoning": 0.443822979927063,
"adv/mean_abs_step_conf": 0.7315582633018494,
"adv/ratio_final_to_reasoning": 1.2931142338137718,
"adv/ratio_step_to_reasoning": 1.648310917614208,
"adv/std_final_conf": 0.8101106286048889,
"adv/std_reasoning": 0.7206270098686218,
"adv/std_step_conf": 0.9346047639846802,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.8156389065479974,
"calib/avg_num_step_conf": 6.01953125,
"calib/ece": 0.22665338645418318,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.49800796812749004,
"calib/gap": 0.46431722822631916,
"calib/mean_conf": 0.5770119521912351,
"calib/mu_c": 0.8008461538461539,
"calib/mu_w": 0.3365289256198347,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.14286852589641424,
"calib/std_conf": 0.45616388927780055,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.43541944074567246,
"calib/step_q_c_n": 751.0,
"calib/step_q_gap": 0.16036880783428004,
"calib/step_q_w": 0.2750506329113924,
"calib/step_q_w_n": 790.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2386.0,
"completions/max_terminated_length": 2386.0,
"completions/mean_length": 469.5625,
"completions/mean_terminated_length": 469.5625,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.17386666666666667,
"grad_norm": 0.03634432330727577,
"kl": 0.09714508056640625,
"learning_rate": 1.0277777777777777e-06,
"loss": -0.0162,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.037918318063020706,
"mask/share_reasoning": 0.8201773166656494,
"mask/share_step_conf": 0.14190436899662018,
"num_tokens": 37553053.0,
"reward": 1.1781021356582642,
"reward_std": 0.1947747766971588,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.7529621124267578,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8709113001823425,
"step": 163
},
{
"adv/mean_abs_final_conf": 0.6148437261581421,
"adv/mean_abs_reasoning": 0.50270015001297,
"adv/mean_abs_step_conf": 0.7660697102546692,
"adv/ratio_final_to_reasoning": 1.2230824401828382,
"adv/ratio_step_to_reasoning": 1.5239098501062793,
"adv/std_final_conf": 0.8307430148124695,
"adv/std_reasoning": 0.7575799822807312,
"adv/std_step_conf": 0.9345313906669617,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.8027272727272727,
"calib/avg_num_step_conf": 6.6953125,
"calib/ece": 0.23243999999999998,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.444,
"calib/gap": 0.4427727272727273,
"calib/mean_conf": 0.5366799999999999,
"calib/mu_c": 0.7315,
"calib/mu_w": 0.2887272727272727,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10455999999999999,
"calib/std_conf": 0.45835420539141997,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.41191158900836317,
"calib/step_q_c_n": 837.0,
"calib/step_q_gap": 0.15996176004599144,
"calib/step_q_w": 0.2519498289623717,
"calib/step_q_w_n": 877.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2870.0,
"completions/max_terminated_length": 2870.0,
"completions/mean_length": 526.41015625,
"completions/mean_terminated_length": 526.41015625,
"completions/min_length": 123.0,
"completions/min_terminated_length": 123.0,
"epoch": 0.17493333333333333,
"grad_norm": 0.035710543394088745,
"kl": 0.0910186767578125,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0663,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.032561205327510834,
"mask/share_reasoning": 0.8339776992797852,
"mask/share_step_conf": 0.1334611475467682,
"num_tokens": 37793950.0,
"reward": 1.172410488128662,
"reward_std": 0.2033061385154724,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.7394199371337891,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8676632642745972,
"step": 164
},
{
"adv/mean_abs_final_conf": 0.5234141945838928,
"adv/mean_abs_reasoning": 0.373664915561676,
"adv/mean_abs_step_conf": 0.7581987380981445,
"adv/ratio_final_to_reasoning": 1.400758200156738,
"adv/ratio_step_to_reasoning": 2.0290873093034567,
"adv/std_final_conf": 0.7647106051445007,
"adv/std_reasoning": 0.6814833879470825,
"adv/std_step_conf": 0.9337098598480225,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7628519356460532,
"calib/avg_num_step_conf": 6.0,
"calib/ece": 0.231501976284585,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5810276679841897,
"calib/gap": 0.48271744595274013,
"calib/mean_conf": 0.615296442687747,
"calib/mu_c": 0.838529411764706,
"calib/mu_w": 0.35581196581196584,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.1546245059288538,
"calib/std_conf": 0.465140136845168,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.40995,
"calib/step_q_c_n": 800.0,
"calib/step_q_gap": 0.13495000000000001,
"calib/step_q_w": 0.27499999999999997,
"calib/step_q_w_n": 736.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1260.0,
"completions/max_terminated_length": 1260.0,
"completions/mean_length": 467.9765625,
"completions/mean_terminated_length": 469.8117980957031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 131.0,
"epoch": 0.176,
"grad_norm": 0.033411670476198196,
"kl": 0.093292236328125,
"learning_rate": 9.722222222222224e-07,
"loss": -0.1565,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.036157675087451935,
"mask/share_reasoning": 0.8241069316864014,
"mask/share_step_conf": 0.1358291506767273,
"num_tokens": 38019328.0,
"reward": 1.1931571960449219,
"reward_std": 0.1731734275817871,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.7560117244720459,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8853057622909546,
"step": 165
},
{
"adv/mean_abs_final_conf": 0.5332635641098022,
"adv/mean_abs_reasoning": 0.46829888224601746,
"adv/mean_abs_step_conf": 0.752028226852417,
"adv/ratio_final_to_reasoning": 1.1387248279393845,
"adv/ratio_step_to_reasoning": 1.605872350678267,
"adv/std_final_conf": 0.7756200432777405,
"adv/std_reasoning": 0.7206501364707947,
"adv/std_step_conf": 0.9337971210479736,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.8616442085204845,
"calib/avg_num_step_conf": 6.8515625,
"calib/ece": 0.1613545816733068,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5059760956175299,
"calib/gap": 0.6092731727235605,
"calib/mean_conf": 0.5751394422310756,
"calib/mu_c": 0.800886075949367,
"calib/mu_w": 0.19161290322580646,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.05350597609561754,
"calib/std_conf": 0.46354974599286264,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.42130822596630324,
"calib/step_q_c_n": 1009.0,
"calib/step_q_gap": 0.14699950113408844,
"calib/step_q_w": 0.2743087248322148,
"calib/step_q_w_n": 745.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2288.0,
"completions/max_terminated_length": 2288.0,
"completions/mean_length": 504.9765625,
"completions/mean_terminated_length": 504.9765625,
"completions/min_length": 150.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.17706666666666668,
"grad_norm": 0.05789630860090256,
"kl": 0.08490753173828125,
"learning_rate": 9.444444444444445e-07,
"loss": -0.0184,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03413501754403114,
"mask/share_reasoning": 0.8212205171585083,
"mask/share_step_conf": 0.14464449882507324,
"num_tokens": 38254786.0,
"reward": 1.2201125621795654,
"reward_std": 0.18350180983543396,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.8095582127571106,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8751322031021118,
"step": 166
},
{
"adv/mean_abs_final_conf": 0.6072766780853271,
"adv/mean_abs_reasoning": 0.45788443088531494,
"adv/mean_abs_step_conf": 0.7228624224662781,
"adv/ratio_final_to_reasoning": 1.3262662740271902,
"adv/ratio_step_to_reasoning": 1.57870059278633,
"adv/std_final_conf": 0.833148181438446,
"adv/std_reasoning": 0.7394189238548279,
"adv/std_step_conf": 0.9349060654640198,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7140316492450639,
"calib/avg_num_step_conf": 5.984375,
"calib/ece": 0.19868000000000008,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.696,
"calib/gap": 0.38988966318234597,
"calib/mean_conf": 0.7716400000000001,
"calib/mu_c": 0.8995238095238095,
"calib/mu_w": 0.5096341463414635,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.14916000000000007,
"calib/std_conf": 0.3831429372962524,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.43428714859437745,
"calib/step_q_c_n": 996.0,
"calib/step_q_gap": 0.048335656057064025,
"calib/step_q_w": 0.3859514925373134,
"calib/step_q_w_n": 536.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2219.0,
"completions/max_terminated_length": 2219.0,
"completions/mean_length": 453.8203125,
"completions/mean_terminated_length": 453.8203125,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.17813333333333334,
"grad_norm": 0.05314803496003151,
"kl": 0.09221649169921875,
"learning_rate": 9.166666666666666e-07,
"loss": -0.0162,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03694336861371994,
"mask/share_reasoning": 0.8237749934196472,
"mask/share_step_conf": 0.13928166031837463,
"num_tokens": 38476572.0,
"reward": 1.1797784566879272,
"reward_std": 0.2056841403245926,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.772200345993042,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8410501480102539,
"step": 167
},
{
"adv/mean_abs_final_conf": 0.5778178572654724,
"adv/mean_abs_reasoning": 0.5160121321678162,
"adv/mean_abs_step_conf": 0.745305597782135,
"adv/ratio_final_to_reasoning": 1.1197757208498267,
"adv/ratio_step_to_reasoning": 1.4443567337284011,
"adv/std_final_conf": 0.7986822128295898,
"adv/std_reasoning": 0.7754129767417908,
"adv/std_step_conf": 0.934490442276001,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.8028047091412743,
"calib/avg_num_step_conf": 7.19921875,
"calib/ece": 0.20668016194331984,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.5384615384615384,
"calib/gap": 0.45569736842105274,
"calib/mean_conf": 0.6240080971659919,
"calib/mu_c": 0.7992763157894738,
"calib/mu_w": 0.34357894736842104,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.10765182186234816,
"calib/std_conf": 0.44303851462430305,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4125210970464135,
"calib/step_q_c_n": 948.0,
"calib/step_q_gap": 0.16366076185088277,
"calib/step_q_w": 0.24886033519553072,
"calib/step_q_w_n": 895.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2641.0,
"completions/max_terminated_length": 2641.0,
"completions/mean_length": 510.34765625,
"completions/mean_terminated_length": 518.4484252929688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 93.0,
"epoch": 0.1792,
"grad_norm": 0.03475351259112358,
"kl": 0.11714935302734375,
"learning_rate": 8.88888888888889e-07,
"loss": -0.0278,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03463764488697052,
"mask/share_reasoning": 0.8162198066711426,
"mask/share_step_conf": 0.13351748883724213,
"num_tokens": 38711893.0,
"reward": 1.1767488718032837,
"reward_std": 0.2032414674758911,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.755155086517334,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8582700490951538,
"step": 168
},
{
"adv/mean_abs_final_conf": 0.6278563737869263,
"adv/mean_abs_reasoning": 0.45637246966362,
"adv/mean_abs_step_conf": 0.7611607909202576,
"adv/ratio_final_to_reasoning": 1.3757542698614194,
"adv/ratio_step_to_reasoning": 1.6678499285491277,
"adv/std_final_conf": 0.8417154550552368,
"adv/std_reasoning": 0.701534628868103,
"adv/std_step_conf": 0.9345760345458984,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7518236074270557,
"calib/avg_num_step_conf": 5.64453125,
"calib/ece": 0.2073092369477912,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.5542168674698795,
"calib/gap": 0.4210716180371354,
"calib/mean_conf": 0.6375100401606426,
"calib/mu_c": 0.8133793103448277,
"calib/mu_w": 0.3923076923076923,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.13124497991967873,
"calib/std_conf": 0.43499183496422156,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.46704738760631836,
"calib/step_q_c_n": 823.0,
"calib/step_q_gap": 0.16497343262239555,
"calib/step_q_w": 0.3020739549839228,
"calib/step_q_w_n": 622.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2524.0,
"completions/max_terminated_length": 2524.0,
"completions/mean_length": 458.37890625,
"completions/mean_terminated_length": 461.9881896972656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 131.0,
"epoch": 0.18026666666666666,
"grad_norm": 0.04523325711488724,
"kl": 0.09043121337890625,
"learning_rate": 8.611111111111112e-07,
"loss": -0.0443,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03630748391151428,
"mask/share_reasoning": 0.8291934728622437,
"mask/share_step_conf": 0.1266864836215973,
"num_tokens": 38933422.0,
"reward": 1.1640233993530273,
"reward_std": 0.22012245655059814,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7365875244140625,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.856806218624115,
"step": 169
},
{
"adv/mean_abs_final_conf": 0.6055898666381836,
"adv/mean_abs_reasoning": 0.4949433207511902,
"adv/mean_abs_step_conf": 0.749203085899353,
"adv/ratio_final_to_reasoning": 1.2235539732490215,
"adv/ratio_step_to_reasoning": 1.5137149133809205,
"adv/std_final_conf": 0.8305554986000061,
"adv/std_reasoning": 0.7575010657310486,
"adv/std_step_conf": 0.9346402287483215,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.8241965369624944,
"calib/avg_num_step_conf": 6.16015625,
"calib/ece": 0.20337301587301593,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.5595238095238095,
"calib/gap": 0.47727237876174045,
"calib/mean_conf": 0.643531746031746,
"calib/mu_c": 0.853758865248227,
"calib/mu_w": 0.3764864864864865,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.14369047619047623,
"calib/std_conf": 0.44279844543302516,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.4413757225433526,
"calib/step_q_c_n": 865.0,
"calib/step_q_gap": 0.14932516074559982,
"calib/step_q_w": 0.2920505617977528,
"calib/step_q_w_n": 712.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2439.0,
"completions/max_terminated_length": 2439.0,
"completions/mean_length": 480.2578125,
"completions/mean_terminated_length": 482.1412048339844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.18133333333333335,
"grad_norm": 0.05760641396045685,
"kl": 0.13443756103515625,
"learning_rate": 8.333333333333333e-07,
"loss": 0.025,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.034499507397413254,
"mask/share_reasoning": 0.827689528465271,
"mask/share_step_conf": 0.13390469551086426,
"num_tokens": 39160520.0,
"reward": 1.1697800159454346,
"reward_std": 0.209752157330513,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7549155950546265,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8543462753295898,
"step": 170
},
{
"adv/mean_abs_final_conf": 0.6666871309280396,
"adv/mean_abs_reasoning": 0.45934537053108215,
"adv/mean_abs_step_conf": 0.7705562114715576,
"adv/ratio_final_to_reasoning": 1.4513853272478499,
"adv/ratio_step_to_reasoning": 1.67750947523573,
"adv/std_final_conf": 0.8589778542518616,
"adv/std_reasoning": 0.701428234577179,
"adv/std_step_conf": 0.9345902800559998,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7058211382113821,
"calib/avg_num_step_conf": 6.2265625,
"calib/ece": 0.3018145161290323,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5040322580645161,
"calib/gap": 0.3157652032520326,
"calib/mean_conf": 0.5811693548387098,
"calib/mu_c": 0.7403252032520325,
"calib/mu_w": 0.42455999999999994,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19350806451612912,
"calib/std_conf": 0.45758641386917104,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4311796246648793,
"calib/step_q_c_n": 746.0,
"calib/step_q_gap": 0.13754754919318118,
"calib/step_q_w": 0.2936320754716981,
"calib/step_q_w_n": 848.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2701.0,
"completions/max_terminated_length": 2701.0,
"completions/mean_length": 469.86328125,
"completions/mean_terminated_length": 473.56298828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.1824,
"grad_norm": 0.04498155787587166,
"kl": 0.0904083251953125,
"learning_rate": 8.055555555555557e-07,
"loss": -0.0194,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.036814551800489426,
"mask/share_reasoning": 0.820866584777832,
"mask/share_step_conf": 0.13450628519058228,
"num_tokens": 39387701.0,
"reward": 1.1181427240371704,
"reward_std": 0.21004648506641388,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.6696425676345825,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8511993288993835,
"step": 171
},
{
"adv/mean_abs_final_conf": 0.6099177002906799,
"adv/mean_abs_reasoning": 0.5050039291381836,
"adv/mean_abs_step_conf": 0.7756307721138,
"adv/ratio_final_to_reasoning": 1.2077484255053963,
"adv/ratio_step_to_reasoning": 1.5358905690841964,
"adv/std_final_conf": 0.8136227130889893,
"adv/std_reasoning": 0.7576004862785339,
"adv/std_step_conf": 0.9339342713356018,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7734019886363637,
"calib/avg_num_step_conf": 6.08984375,
"calib/ece": 0.21342741935483872,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6451612903225806,
"calib/gap": 0.29330681818181825,
"calib/mean_conf": 0.7747983870967742,
"calib/mu_c": 0.8788750000000001,
"calib/mu_w": 0.5855681818181818,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.17153225806451616,
"calib/std_conf": 0.3564655681657362,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.44785467128027684,
"calib/step_q_c_n": 867.0,
"calib/step_q_gap": 0.11626507590455432,
"calib/step_q_w": 0.3315895953757225,
"calib/step_q_w_n": 692.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2520.0,
"completions/max_terminated_length": 2520.0,
"completions/mean_length": 446.390625,
"completions/mean_terminated_length": 449.905517578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 112.0,
"epoch": 0.18346666666666667,
"grad_norm": 0.04795660451054573,
"kl": 0.08995819091796875,
"learning_rate": 7.777777777777779e-07,
"loss": -0.0394,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03795355185866356,
"mask/share_reasoning": 0.8114452362060547,
"mask/share_step_conf": 0.14278870820999146,
"num_tokens": 39605329.0,
"reward": 1.1778701543807983,
"reward_std": 0.20241671800613403,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7376941442489624,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8661972880363464,
"step": 172
},
{
"adv/mean_abs_final_conf": 0.622528612613678,
"adv/mean_abs_reasoning": 0.5239760279655457,
"adv/mean_abs_step_conf": 0.7593715190887451,
"adv/ratio_final_to_reasoning": 1.188086056209069,
"adv/ratio_step_to_reasoning": 1.4492485887897872,
"adv/std_final_conf": 0.8453269004821777,
"adv/std_reasoning": 0.7753785848617554,
"adv/std_step_conf": 0.9348943829536438,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7443,
"calib/avg_num_step_conf": 5.90625,
"calib/ece": 0.26796000000000003,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.7,
"calib/gap": 0.28856666666666675,
"calib/mean_conf": 0.78724,
"calib/mu_c": 0.9026666666666667,
"calib/mu_w": 0.6141,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.22760000000000005,
"calib/std_conf": 0.36290381976496194,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.4717543859649123,
"calib/step_q_c_n": 912.0,
"calib/step_q_gap": 0.08097105263157905,
"calib/step_q_w": 0.39078333333333326,
"calib/step_q_w_n": 600.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2472.0,
"completions/max_terminated_length": 2472.0,
"completions/mean_length": 491.66015625,
"completions/mean_terminated_length": 491.66015625,
"completions/min_length": 76.0,
"completions/min_terminated_length": 76.0,
"epoch": 0.18453333333333333,
"grad_norm": 0.03766198456287384,
"kl": 0.08562469482421875,
"learning_rate": 7.5e-07,
"loss": -0.0357,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03844533860683441,
"mask/share_reasoning": 0.8244307041168213,
"mask/share_step_conf": 0.1371239721775055,
"num_tokens": 39834354.0,
"reward": 1.1404576301574707,
"reward_std": 0.22183360159397125,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.7107362747192383,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.838973343372345,
"step": 173
},
{
"adv/mean_abs_final_conf": 0.657545804977417,
"adv/mean_abs_reasoning": 0.57723069190979,
"adv/mean_abs_step_conf": 0.7305585741996765,
"adv/ratio_final_to_reasoning": 1.1391386740055371,
"adv/ratio_step_to_reasoning": 1.2656266973306551,
"adv/std_final_conf": 0.844294011592865,
"adv/std_reasoning": 0.7929427623748779,
"adv/std_step_conf": 0.9203072190284729,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6821343402225755,
"calib/avg_num_step_conf": 6.1015625,
"calib/ece": 0.31064777327935217,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.5384615384615384,
"calib/gap": 0.30688857975622674,
"calib/mean_conf": 0.6048987854251012,
"calib/mu_c": 0.7738738738738739,
"calib/mu_w": 0.4669852941176471,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.23307692307692301,
"calib/std_conf": 0.4544435244132277,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.40302702702702703,
"calib/step_q_c_n": 740.0,
"calib/step_q_gap": 0.04370829223383965,
"calib/step_q_w": 0.3593187347931874,
"calib/step_q_w_n": 822.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2603.0,
"completions/max_terminated_length": 2603.0,
"completions/mean_length": 520.83984375,
"completions/mean_terminated_length": 520.83984375,
"completions/min_length": 131.0,
"completions/min_terminated_length": 131.0,
"epoch": 0.1856,
"grad_norm": 0.03625209629535675,
"kl": 0.0858917236328125,
"learning_rate": 7.222222222222222e-07,
"loss": -0.0454,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03234461694955826,
"mask/share_reasoning": 0.838646650314331,
"mask/share_step_conf": 0.1290086805820465,
"num_tokens": 40071921.0,
"reward": 1.0493491888046265,
"reward_std": 0.23219379782676697,
"rewards/accuracy_reward_step": 0.43359375,
"rewards/final_brier_reward_step": 0.6305624842643738,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": 0.7949029803276062,
"step": 174
},
{
"adv/mean_abs_final_conf": 0.727577805519104,
"adv/mean_abs_reasoning": 0.4823678731918335,
"adv/mean_abs_step_conf": 0.7414121627807617,
"adv/ratio_final_to_reasoning": 1.5083463181424452,
"adv/ratio_step_to_reasoning": 1.5370264148705206,
"adv/std_final_conf": 0.8904538154602051,
"adv/std_reasoning": 0.7394604682922363,
"adv/std_step_conf": 0.9354656338691711,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7771801250588315,
"calib/avg_num_step_conf": 6.73046875,
"calib/ece": 0.24191056910569106,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.4186991869918699,
"calib/gap": 0.4366859409668526,
"calib/mean_conf": 0.5297967479674797,
"calib/mu_c": 0.7765420560747663,
"calib/mu_w": 0.33985611510791375,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.16837398373983742,
"calib/std_conf": 0.4564305209746925,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.4754432624113476,
"calib/step_q_c_n": 564.0,
"calib/step_q_gap": 0.20732419424913878,
"calib/step_q_w": 0.2681190681622088,
"calib/step_q_w_n": 1159.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2853.0,
"completions/max_terminated_length": 2853.0,
"completions/mean_length": 517.40625,
"completions/mean_terminated_length": 523.54150390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.18666666666666668,
"grad_norm": 0.05229157581925392,
"kl": 0.08309173583984375,
"learning_rate": 6.944444444444446e-07,
"loss": -0.1045,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03500085324048996,
"mask/share_reasoning": 0.8173953294754028,
"mask/share_step_conf": 0.1358851194381714,
"num_tokens": 40310201.0,
"reward": 1.1124995946884155,
"reward_std": 0.2691600024700165,
"rewards/accuracy_reward_step": 0.41796875,
"rewards/final_brier_reward_step": 0.7121738195419312,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.826258659362793,
"step": 175
},
{
"adv/mean_abs_final_conf": 0.6193594932556152,
"adv/mean_abs_reasoning": 0.5067633390426636,
"adv/mean_abs_step_conf": 0.7527080178260803,
"adv/ratio_final_to_reasoning": 1.2221868583186368,
"adv/ratio_step_to_reasoning": 1.4853245288975236,
"adv/std_final_conf": 0.8128105998039246,
"adv/std_reasoning": 0.7575881481170654,
"adv/std_step_conf": 0.9347853064537048,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.8053527980535279,
"calib/avg_num_step_conf": 6.37109375,
"calib/ece": 0.21798387096774188,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.5766129032258065,
"calib/gap": 0.4824081015321892,
"calib/mean_conf": 0.6607258064516128,
"calib/mu_c": 0.8766423357664235,
"calib/mu_w": 0.39423423423423426,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.16314516129032253,
"calib/std_conf": 0.4443692038542129,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.4625,
"calib/step_q_c_n": 820.0,
"calib/step_q_gap": 0.1858415536374846,
"calib/step_q_w": 0.2766584463625154,
"calib/step_q_w_n": 811.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2362.0,
"completions/max_terminated_length": 2362.0,
"completions/mean_length": 475.4765625,
"completions/mean_terminated_length": 477.3412170410156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 84.0,
"epoch": 0.18773333333333334,
"grad_norm": 0.04875698313117027,
"kl": 0.08687591552734375,
"learning_rate": 6.666666666666667e-07,
"loss": -0.054,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03778859227895737,
"mask/share_reasoning": 0.8117580413818359,
"mask/share_step_conf": 0.14654704928398132,
"num_tokens": 40535987.0,
"reward": 1.163760781288147,
"reward_std": 0.2515183985233307,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.7497754096984863,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.8528724312782288,
"step": 176
},
{
"adv/mean_abs_final_conf": 0.5847810506820679,
"adv/mean_abs_reasoning": 0.4223461151123047,
"adv/mean_abs_step_conf": 0.7501903772354126,
"adv/ratio_final_to_reasoning": 1.38460146727423,
"adv/ratio_step_to_reasoning": 1.7762454782753996,
"adv/std_final_conf": 0.788453996181488,
"adv/std_reasoning": 0.7013620138168335,
"adv/std_step_conf": 0.9340070486068726,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7479757085020242,
"calib/avg_num_step_conf": 6.36328125,
"calib/ece": 0.23091633466135472,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.49800796812749004,
"calib/gap": 0.4112827260458839,
"calib/mean_conf": 0.5870916334661355,
"calib/mu_c": 0.7427564102564103,
"calib/mu_w": 0.33147368421052636,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.09824701195219138,
"calib/std_conf": 0.45215102055416,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4445251396648045,
"calib/step_q_c_n": 895.0,
"calib/step_q_gap": 0.16945701977379635,
"calib/step_q_w": 0.27506811989100816,
"calib/step_q_w_n": 734.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2808.0,
"completions/max_terminated_length": 2808.0,
"completions/mean_length": 506.98828125,
"completions/mean_terminated_length": 506.98828125,
"completions/min_length": 133.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.1888,
"grad_norm": 0.048805780708789825,
"kl": 0.08119964599609375,
"learning_rate": 6.388888888888889e-07,
"loss": 0.1032,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.034971535205841064,
"mask/share_reasoning": 0.8307249546051025,
"mask/share_step_conf": 0.1343035101890564,
"num_tokens": 40769608.0,
"reward": 1.1738266944885254,
"reward_std": 0.19798734784126282,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7344551086425781,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.864007294178009,
"step": 177
},
{
"adv/mean_abs_final_conf": 0.6846010684967041,
"adv/mean_abs_reasoning": 0.5379370450973511,
"adv/mean_abs_step_conf": 0.7198708057403564,
"adv/ratio_final_to_reasoning": 1.2726416125009778,
"adv/ratio_step_to_reasoning": 1.3382064170912056,
"adv/std_final_conf": 0.8916929364204407,
"adv/std_reasoning": 0.7754216194152832,
"adv/std_step_conf": 0.9352347254753113,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.8307109557109558,
"calib/avg_num_step_conf": 6.1796875,
"calib/ece": 0.1717928286852589,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5418326693227091,
"calib/gap": 0.5239355089355089,
"calib/mean_conf": 0.6412749003984064,
"calib/mu_c": 0.8667132867132866,
"calib/mu_w": 0.34277777777777774,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.12167330677290833,
"calib/std_conf": 0.43307660608780046,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4629019607843137,
"calib/step_q_c_n": 765.0,
"calib/step_q_gap": 0.1892422300621595,
"calib/step_q_w": 0.2736597307221542,
"calib/step_q_w_n": 817.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2627.0,
"completions/max_terminated_length": 2627.0,
"completions/mean_length": 456.59765625,
"completions/mean_terminated_length": 458.3882751464844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 119.0,
"epoch": 0.18986666666666666,
"grad_norm": 0.044407956302165985,
"kl": 0.09165191650390625,
"learning_rate": 6.111111111111112e-07,
"loss": 0.0158,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.038511939346790314,
"mask/share_reasoning": 0.8155031204223633,
"mask/share_step_conf": 0.142078697681427,
"num_tokens": 40992569.0,
"reward": 1.1956806182861328,
"reward_std": 0.2603704333305359,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.791344165802002,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8625114560127258,
"step": 178
},
{
"adv/mean_abs_final_conf": 0.6494489312171936,
"adv/mean_abs_reasoning": 0.5641993284225464,
"adv/mean_abs_step_conf": 0.7575774192810059,
"adv/ratio_final_to_reasoning": 1.15109837693887,
"adv/ratio_step_to_reasoning": 1.3427478217656308,
"adv/std_final_conf": 0.8725244402885437,
"adv/std_reasoning": 0.8097829818725586,
"adv/std_step_conf": 0.9347437620162964,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7800624501727346,
"calib/avg_num_step_conf": 5.79296875,
"calib/ece": 0.22362903225806446,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.6048387096774194,
"calib/gap": 0.4450199309061919,
"calib/mean_conf": 0.6555645161290322,
"calib/mu_c": 0.845774647887324,
"calib/mu_w": 0.40075471698113213,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.15330645161290318,
"calib/std_conf": 0.4468416086903993,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4719305019305019,
"calib/step_q_c_n": 777.0,
"calib/step_q_gap": 0.14313446793616763,
"calib/step_q_w": 0.32879603399433427,
"calib/step_q_w_n": 706.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2990.0,
"completions/max_terminated_length": 2990.0,
"completions/mean_length": 466.59375,
"completions/mean_terminated_length": 466.59375,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.19093333333333334,
"grad_norm": 0.047058381140232086,
"kl": 0.08917236328125,
"learning_rate": 5.833333333333334e-07,
"loss": -0.0442,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03583589196205139,
"mask/share_reasoning": 0.8318737745285034,
"mask/share_step_conf": 0.13229036331176758,
"num_tokens": 41218281.0,
"reward": 1.1654024124145508,
"reward_std": 0.24288588762283325,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.7387410402297974,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.858771562576294,
"step": 179
},
{
"adv/mean_abs_final_conf": 0.5855597853660583,
"adv/mean_abs_reasoning": 0.36934980750083923,
"adv/mean_abs_step_conf": 0.7651547193527222,
"adv/ratio_final_to_reasoning": 1.5853799662931403,
"adv/ratio_step_to_reasoning": 2.0716261490158856,
"adv/std_final_conf": 0.7945824861526489,
"adv/std_reasoning": 0.640407383441925,
"adv/std_step_conf": 0.934300422668457,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7083667200854701,
"calib/avg_num_step_conf": 6.68359375,
"calib/ece": 0.2598412698412698,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5912698412698413,
"calib/gap": 0.3162339743589744,
"calib/mean_conf": 0.6772222222222222,
"calib/mu_c": 0.7976923076923077,
"calib/mu_w": 0.4814583333333333,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.15900793650793651,
"calib/std_conf": 0.4247971297112714,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4480020181634713,
"calib/step_q_c_n": 991.0,
"calib/step_q_gap": 0.12418257371902686,
"calib/step_q_w": 0.32381944444444444,
"calib/step_q_w_n": 720.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2554.0,
"completions/max_terminated_length": 2554.0,
"completions/mean_length": 540.7421875,
"completions/mean_terminated_length": 540.7421875,
"completions/min_length": 115.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.192,
"grad_norm": 0.04770103842020035,
"kl": 0.08750152587890625,
"learning_rate": 5.555555555555555e-07,
"loss": 0.0112,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03379952535033226,
"mask/share_reasoning": 0.8321336507797241,
"mask/share_step_conf": 0.13406683504581451,
"num_tokens": 41460567.0,
"reward": 1.1437525749206543,
"reward_std": 0.20699778199195862,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.713333249092102,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8385103940963745,
"step": 180
},
{
"adv/mean_abs_final_conf": 0.6123881340026855,
"adv/mean_abs_reasoning": 0.47117334604263306,
"adv/mean_abs_step_conf": 0.7513047456741333,
"adv/ratio_final_to_reasoning": 1.2997087784063128,
"adv/ratio_step_to_reasoning": 1.5945399967640639,
"adv/std_final_conf": 0.8289145231246948,
"adv/std_reasoning": 0.7391869425773621,
"adv/std_step_conf": 0.9340299963951111,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.8090897817460319,
"calib/avg_num_step_conf": 5.796875,
"calib/ece": 0.2392125984251967,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5236220472440944,
"calib/gap": 0.4368229166666666,
"calib/mean_conf": 0.6165354330708661,
"calib/mu_c": 0.8366666666666667,
"calib/mu_w": 0.39984375000000005,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.17984251968503923,
"calib/std_conf": 0.446855818760644,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5040673211781206,
"calib/step_q_c_n": 713.0,
"calib/step_q_gap": 0.20187536268266015,
"calib/step_q_w": 0.30219195849546043,
"calib/step_q_w_n": 771.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2761.0,
"completions/max_terminated_length": 2761.0,
"completions/mean_length": 429.00390625,
"completions/mean_terminated_length": 430.6863098144531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.19306666666666666,
"grad_norm": 0.05041556805372238,
"kl": 0.1006317138671875,
"learning_rate": 5.277777777777779e-07,
"loss": -0.0221,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03834264352917671,
"mask/share_reasoning": 0.8177053332328796,
"mask/share_step_conf": 0.14004576206207275,
"num_tokens": 41676656.0,
"reward": 1.1766166687011719,
"reward_std": 0.21318820118904114,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.740514874458313,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8782706260681152,
"step": 181
},
{
"adv/mean_abs_final_conf": 0.5687905550003052,
"adv/mean_abs_reasoning": 0.3563820719718933,
"adv/mean_abs_step_conf": 0.7398570775985718,
"adv/ratio_final_to_reasoning": 1.5960133792733655,
"adv/ratio_step_to_reasoning": 2.076022156515555,
"adv/std_final_conf": 0.7982797622680664,
"adv/std_reasoning": 0.6815344095230103,
"adv/std_step_conf": 0.9340426325798035,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.8294670846394984,
"calib/avg_num_step_conf": 6.7890625,
"calib/ece": 0.17182539682539683,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6428571428571429,
"calib/gap": 0.4777220480668757,
"calib/mean_conf": 0.719920634920635,
"calib/mu_c": 0.8848484848484849,
"calib/mu_w": 0.4071264367816092,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.11849206349206347,
"calib/std_conf": 0.41007064051102304,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4741460905349794,
"calib/step_q_c_n": 972.0,
"calib/step_q_gap": 0.17635235685351724,
"calib/step_q_w": 0.29779373368146217,
"calib/step_q_w_n": 766.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2475.0,
"completions/max_terminated_length": 2475.0,
"completions/mean_length": 477.92578125,
"completions/mean_terminated_length": 479.8000183105469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 182.0,
"epoch": 0.19413333333333332,
"grad_norm": 0.05087687447667122,
"kl": 0.08599090576171875,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0218,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.035774778574705124,
"mask/share_reasoning": 0.8111280202865601,
"mask/share_step_conf": 0.14919093251228333,
"num_tokens": 41905165.0,
"reward": 1.2298622131347656,
"reward_std": 0.19393043220043182,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.8012698888778687,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8889696002006531,
"step": 182
},
{
"adv/mean_abs_final_conf": 0.6119903922080994,
"adv/mean_abs_reasoning": 0.5624978542327881,
"adv/mean_abs_step_conf": 0.7621275186538696,
"adv/ratio_final_to_reasoning": 1.087987069822366,
"adv/ratio_step_to_reasoning": 1.354898535023505,
"adv/std_final_conf": 0.8451369404792786,
"adv/std_reasoning": 0.8098740577697754,
"adv/std_step_conf": 0.9352811574935913,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6952368658835678,
"calib/avg_num_step_conf": 6.1875,
"calib/ece": 0.335582329317269,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.5542168674698795,
"calib/gap": 0.2515296243707241,
"calib/mean_conf": 0.6394377510040161,
"calib/mu_c": 0.7626771653543307,
"calib/mu_w": 0.5111475409836066,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.23248995983935736,
"calib/std_conf": 0.4430531225691059,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.46950437317784255,
"calib/step_q_c_n": 686.0,
"calib/step_q_gap": 0.12947096560545945,
"calib/step_q_w": 0.3400334075723831,
"calib/step_q_w_n": 898.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1922.0,
"completions/max_terminated_length": 1922.0,
"completions/mean_length": 480.74609375,
"completions/mean_terminated_length": 484.531494140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.1952,
"grad_norm": 0.034335847944021225,
"kl": 0.1324462890625,
"learning_rate": 4.7222222222222226e-07,
"loss": -0.1044,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03612757846713066,
"mask/share_reasoning": 0.8249562978744507,
"mask/share_step_conf": 0.13110360503196716,
"num_tokens": 42134916.0,
"reward": 1.0904525518417358,
"reward_std": 0.23203977942466736,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.6375433206558228,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8341160416603088,
"step": 183
},
{
"adv/mean_abs_final_conf": 0.5991448163986206,
"adv/mean_abs_reasoning": 0.4674597978591919,
"adv/mean_abs_step_conf": 0.7482036352157593,
"adv/ratio_final_to_reasoning": 1.2817034088118415,
"adv/ratio_step_to_reasoning": 1.600573222857408,
"adv/std_final_conf": 0.8104008436203003,
"adv/std_reasoning": 0.7207074761390686,
"adv/std_step_conf": 0.9341971278190613,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7524910767400357,
"calib/avg_num_step_conf": 7.1953125,
"calib/ece": 0.2244715447154471,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.6747967479674797,
"calib/gap": 0.38737804878048776,
"calib/mean_conf": 0.7295934959349593,
"calib/mu_c": 0.8587195121951219,
"calib/mu_w": 0.4713414634146341,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1436991869918699,
"calib/std_conf": 0.41417936959921076,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.46252849740932644,
"calib/step_q_c_n": 965.0,
"calib/step_q_gap": 0.1756824312747769,
"calib/step_q_w": 0.28684606613454955,
"calib/step_q_w_n": 877.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2790.0,
"completions/max_terminated_length": 2790.0,
"completions/mean_length": 491.4765625,
"completions/mean_terminated_length": 499.2778015136719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.19626666666666667,
"grad_norm": 0.05411386862397194,
"kl": 0.22634124755859375,
"learning_rate": 4.444444444444445e-07,
"loss": 0.0824,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03469008952379227,
"mask/share_reasoning": 0.8151426911354065,
"mask/share_step_conf": 0.13454222679138184,
"num_tokens": 42366014.0,
"reward": 1.163267731666565,
"reward_std": 0.20311492681503296,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.7441898584365845,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8413553237915039,
"step": 184
},
{
"adv/mean_abs_final_conf": 0.5832977294921875,
"adv/mean_abs_reasoning": 0.41579145193099976,
"adv/mean_abs_step_conf": 0.7531085014343262,
"adv/ratio_final_to_reasoning": 1.4028612824608653,
"adv/ratio_step_to_reasoning": 1.8112649933921776,
"adv/std_final_conf": 0.7961451411247253,
"adv/std_reasoning": 0.6816564798355103,
"adv/std_step_conf": 0.9346182346343994,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.844640434192673,
"calib/avg_num_step_conf": 6.64453125,
"calib/ece": 0.2341803278688525,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.6885245901639344,
"calib/gap": 0.47905834464043406,
"calib/mean_conf": 0.7341803278688525,
"calib/mu_c": 0.9501492537313432,
"calib/mu_w": 0.4710909090909091,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.20959016393442625,
"calib/std_conf": 0.41856378201297734,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.49303072625698324,
"calib/step_q_c_n": 716.0,
"calib/step_q_gap": 0.20566016788134872,
"calib/step_q_w": 0.2873705583756345,
"calib/step_q_w_n": 985.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 3014.0,
"completions/max_terminated_length": 3014.0,
"completions/mean_length": 494.7421875,
"completions/mean_terminated_length": 502.59527587890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.19733333333333333,
"grad_norm": 0.04859265312552452,
"kl": 0.07753753662109375,
"learning_rate": 4.1666666666666667e-07,
"loss": -0.0234,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.03509043902158737,
"mask/share_reasoning": 0.8189756274223328,
"mask/share_step_conf": 0.13030895590782166,
"num_tokens": 42599588.0,
"reward": 1.1451895236968994,
"reward_std": 0.2622781991958618,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.7397312521934509,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.8374109268188477,
"step": 185
},
{
"adv/mean_abs_final_conf": 0.5210508108139038,
"adv/mean_abs_reasoning": 0.4355603754520416,
"adv/mean_abs_step_conf": 0.7629748582839966,
"adv/ratio_final_to_reasoning": 1.1962768887622912,
"adv/ratio_step_to_reasoning": 1.751708606395041,
"adv/std_final_conf": 0.7428054809570312,
"adv/std_reasoning": 0.7013394832611084,
"adv/std_step_conf": 0.9342164993286133,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7337121712699881,
"calib/avg_num_step_conf": 6.21484375,
"calib/ece": 0.2501568627450981,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5843137254901961,
"calib/gap": 0.38337319941852777,
"calib/mean_conf": 0.6436470588235295,
"calib/mu_c": 0.7849689440993789,
"calib/mu_w": 0.40159574468085113,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1312156862745098,
"calib/std_conf": 0.4509776231799052,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4645993413830955,
"calib/step_q_c_n": 911.0,
"calib/step_q_gap": 0.15331992961838958,
"calib/step_q_w": 0.3112794117647059,
"calib/step_q_w_n": 680.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2294.0,
"completions/max_terminated_length": 2294.0,
"completions/mean_length": 479.3515625,
"completions/mean_terminated_length": 479.3515625,
"completions/min_length": 99.0,
"completions/min_terminated_length": 99.0,
"epoch": 0.1984,
"grad_norm": 0.042646367102861404,
"kl": 0.0824432373046875,
"learning_rate": 3.8888888888888895e-07,
"loss": 0.0169,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03666163980960846,
"mask/share_reasoning": 0.82917320728302,
"mask/share_step_conf": 0.13416513800621033,
"num_tokens": 42827342.0,
"reward": 1.193156123161316,
"reward_std": 0.17940464615821838,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.7355316281318665,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8843746185302734,
"step": 186
},
{
"adv/mean_abs_final_conf": 0.6884365081787109,
"adv/mean_abs_reasoning": 0.6354086399078369,
"adv/mean_abs_step_conf": 0.7570756673812866,
"adv/ratio_final_to_reasoning": 1.0834547485513661,
"adv/ratio_step_to_reasoning": 1.1914783964711229,
"adv/std_final_conf": 0.8458472490310669,
"adv/std_reasoning": 0.8267138600349426,
"adv/std_step_conf": 0.9348864555358887,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7563992611162422,
"calib/avg_num_step_conf": 7.09375,
"calib/ece": 0.25024096385542177,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.6305220883534136,
"calib/gap": 0.35983375115450594,
"calib/mean_conf": 0.6989156626506025,
"calib/mu_c": 0.8520979020979021,
"calib/mu_w": 0.4922641509433962,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.18742971887550208,
"calib/std_conf": 0.41745455096831713,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4724045801526718,
"calib/step_q_c_n": 786.0,
"calib/step_q_gap": 0.17693856073519604,
"calib/step_q_w": 0.29546601941747574,
"calib/step_q_w_n": 1030.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2754.0,
"completions/max_terminated_length": 2754.0,
"completions/mean_length": 535.2734375,
"completions/mean_terminated_length": 537.37255859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.19946666666666665,
"grad_norm": 0.03536956384778023,
"kl": 0.07340240478515625,
"learning_rate": 3.611111111111111e-07,
"loss": 0.0515,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03275679051876068,
"mask/share_reasoning": 0.8340227603912354,
"mask/share_step_conf": 0.12931418418884277,
"num_tokens": 43065916.0,
"reward": 1.146524429321289,
"reward_std": 0.2738466262817383,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7100933790206909,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8526995182037354,
"step": 187
},
{
"adv/mean_abs_final_conf": 0.5937135815620422,
"adv/mean_abs_reasoning": 0.4579869210720062,
"adv/mean_abs_step_conf": 0.723059892654419,
"adv/ratio_final_to_reasoning": 1.296354883175139,
"adv/ratio_step_to_reasoning": 1.578778474638443,
"adv/std_final_conf": 0.8131174445152283,
"adv/std_reasoning": 0.7394041419029236,
"adv/std_step_conf": 0.9349931478500366,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.8136234961075726,
"calib/avg_num_step_conf": 6.890625,
"calib/ece": 0.21846153846153854,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.6275303643724697,
"calib/gap": 0.42784713375796185,
"calib/mean_conf": 0.6959514170040485,
"calib/mu_c": 0.8518471337579618,
"calib/mu_w": 0.424,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.1393927125506074,
"calib/std_conf": 0.42652418481345883,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4834907597535934,
"calib/step_q_c_n": 974.0,
"calib/step_q_gap": 0.18722493696878328,
"calib/step_q_w": 0.2962658227848101,
"calib/step_q_w_n": 790.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 3044.0,
"completions/max_terminated_length": 3044.0,
"completions/mean_length": 546.60546875,
"completions/mean_terminated_length": 550.909423828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.20053333333333334,
"grad_norm": 0.04171891510486603,
"kl": 0.0738525390625,
"learning_rate": 3.3333333333333335e-07,
"loss": -0.0148,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.033632516860961914,
"mask/share_reasoning": 0.8178136944770813,
"mask/share_step_conf": 0.14074131846427917,
"num_tokens": 43309919.0,
"reward": 1.171053171157837,
"reward_std": 0.25949686765670776,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7496523261070251,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8517402410507202,
"step": 188
},
{
"adv/mean_abs_final_conf": 0.5931285619735718,
"adv/mean_abs_reasoning": 0.42756614089012146,
"adv/mean_abs_step_conf": 0.7624196410179138,
"adv/ratio_final_to_reasoning": 1.3872206081117109,
"adv/ratio_step_to_reasoning": 1.7831618739282844,
"adv/std_final_conf": 0.8115628957748413,
"adv/std_reasoning": 0.7205988168716431,
"adv/std_step_conf": 0.9347566962242126,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7398329933707293,
"calib/avg_num_step_conf": 5.66796875,
"calib/ece": 0.2790944881889764,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.547244094488189,
"calib/gap": 0.3474655787863335,
"calib/mean_conf": 0.6197244094488189,
"calib/mu_c": 0.7647297297297297,
"calib/mu_w": 0.41726415094339625,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.15807086614173235,
"calib/std_conf": 0.45681661540802154,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.45375615763546795,
"calib/step_q_c_n": 812.0,
"calib/step_q_gap": 0.1683101482457966,
"calib/step_q_w": 0.28544600938967135,
"calib/step_q_w_n": 639.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1661.0,
"completions/max_terminated_length": 1661.0,
"completions/mean_length": 459.51953125,
"completions/mean_terminated_length": 459.51953125,
"completions/min_length": 133.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.2016,
"grad_norm": 0.046347636729478836,
"kl": 0.0937652587890625,
"learning_rate": 3.055555555555556e-07,
"loss": -0.0272,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.036958396434783936,
"mask/share_reasoning": 0.8306939601898193,
"mask/share_step_conf": 0.13234764337539673,
"num_tokens": 43535324.0,
"reward": 1.1333717107772827,
"reward_std": 0.22686219215393066,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.6966671347618103,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8404673933982849,
"step": 189
},
{
"adv/mean_abs_final_conf": 0.610427975654602,
"adv/mean_abs_reasoning": 0.4907395839691162,
"adv/mean_abs_step_conf": 0.7486152648925781,
"adv/ratio_final_to_reasoning": 1.2438939013589296,
"adv/ratio_step_to_reasoning": 1.5254837582853125,
"adv/std_final_conf": 0.8305718898773193,
"adv/std_reasoning": 0.7575461864471436,
"adv/std_step_conf": 0.9347136616706848,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.793721386527142,
"calib/avg_num_step_conf": 6.2421875,
"calib/ece": 0.23028112449799204,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5783132530120482,
"calib/gap": 0.4535696533682146,
"calib/mean_conf": 0.6357429718875502,
"calib/mu_c": 0.8361151079136692,
"calib/mu_w": 0.3825454545454546,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.15389558232931733,
"calib/std_conf": 0.4508371844338888,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4700248756218905,
"calib/step_q_c_n": 804.0,
"calib/step_q_gap": 0.1532364625236537,
"calib/step_q_w": 0.3167884130982368,
"calib/step_q_w_n": 794.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2549.0,
"completions/max_terminated_length": 2549.0,
"completions/mean_length": 544.6953125,
"completions/mean_terminated_length": 544.6953125,
"completions/min_length": 165.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.20266666666666666,
"grad_norm": 0.05424446985125542,
"kl": 0.07772064208984375,
"learning_rate": 2.7777777777777776e-07,
"loss": 0.0097,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03243604302406311,
"mask/share_reasoning": 0.8429285287857056,
"mask/share_step_conf": 0.12463542073965073,
"num_tokens": 43780374.0,
"reward": 1.1627497673034668,
"reward_std": 0.25788241624832153,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.7429359555244446,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8534798622131348,
"step": 190
},
{
"adv/mean_abs_final_conf": 0.6643708348274231,
"adv/mean_abs_reasoning": 0.514706015586853,
"adv/mean_abs_step_conf": 0.7687493562698364,
"adv/ratio_final_to_reasoning": 1.2907772878269677,
"adv/ratio_step_to_reasoning": 1.4935697912784844,
"adv/std_final_conf": 0.8442904949188232,
"adv/std_reasoning": 0.7753342986106873,
"adv/std_step_conf": 0.9353137016296387,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.727679176263379,
"calib/avg_num_step_conf": 6.828125,
"calib/ece": 0.2772427983539095,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.6460905349794238,
"calib/gap": 0.36340739737162975,
"calib/mean_conf": 0.7232510288065843,
"calib/mu_c": 0.905702479338843,
"calib/mu_w": 0.5422950819672132,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.25127572016460914,
"calib/std_conf": 0.4076214842634009,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5024079320113314,
"calib/step_q_c_n": 706.0,
"calib/step_q_gap": 0.1735883542762066,
"calib/step_q_w": 0.32881957773512477,
"calib/step_q_w_n": 1042.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 3017.0,
"completions/max_terminated_length": 3017.0,
"completions/mean_length": 492.6484375,
"completions/mean_terminated_length": 498.4901428222656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.20373333333333332,
"grad_norm": 0.037491872906684875,
"kl": 0.10839080810546875,
"learning_rate": 2.5000000000000004e-07,
"loss": 0.0836,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.03837917000055313,
"mask/share_reasoning": 0.8010759353637695,
"mask/share_step_conf": 0.14882618188858032,
"num_tokens": 44010660.0,
"reward": 1.0963408946990967,
"reward_std": 0.24359621107578278,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.6784878969192505,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.8198792934417725,
"step": 191
},
{
"adv/mean_abs_final_conf": 0.6128907203674316,
"adv/mean_abs_reasoning": 0.4897460341453552,
"adv/mean_abs_step_conf": 0.7631070613861084,
"adv/ratio_final_to_reasoning": 1.2514460100467653,
"adv/ratio_step_to_reasoning": 1.5581689450896512,
"adv/std_final_conf": 0.8317660689353943,
"adv/std_reasoning": 0.7574921250343323,
"adv/std_step_conf": 0.9336950778961182,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.8238347042694869,
"calib/avg_num_step_conf": 6.01953125,
"calib/ece": 0.19076305220883533,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5783132530120482,
"calib/gap": 0.5040423031727381,
"calib/mean_conf": 0.6564658634538152,
"calib/mu_c": 0.8811594202898553,
"calib/mu_w": 0.37711711711711715,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.14650602409638552,
"calib/std_conf": 0.43538419519899174,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.4674747474747475,
"calib/step_q_c_n": 792.0,
"calib/step_q_gap": 0.15396339901012795,
"calib/step_q_w": 0.31351134846461953,
"calib/step_q_w_n": 749.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2187.0,
"completions/max_terminated_length": 2187.0,
"completions/mean_length": 487.4140625,
"completions/mean_terminated_length": 487.4140625,
"completions/min_length": 113.0,
"completions/min_terminated_length": 113.0,
"epoch": 0.2048,
"grad_norm": 0.047550417482852936,
"kl": 0.0874786376953125,
"learning_rate": 2.2222222222222224e-07,
"loss": -0.0223,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.038757845759391785,
"mask/share_reasoning": 0.8241227865219116,
"mask/share_step_conf": 0.137119323015213,
"num_tokens": 44240414.0,
"reward": 1.1892247200012207,
"reward_std": 0.200741708278656,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.7761476039886475,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.867159366607666,
"step": 192
},
{
"adv/mean_abs_final_conf": 0.6267382502555847,
"adv/mean_abs_reasoning": 0.558899462223053,
"adv/mean_abs_step_conf": 0.7465231418609619,
"adv/ratio_final_to_reasoning": 1.1213792329709877,
"adv/ratio_step_to_reasoning": 1.3357020221340445,
"adv/std_final_conf": 0.8460562229156494,
"adv/std_reasoning": 0.8097629547119141,
"adv/std_step_conf": 0.9347980618476868,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.8055693376333416,
"calib/avg_num_step_conf": 6.22265625,
"calib/ece": 0.22145098039215677,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5058823529411764,
"calib/gap": 0.44473517737534113,
"calib/mean_conf": 0.5933725490196079,
"calib/mu_c": 0.7956834532374102,
"calib/mu_w": 0.35094827586206906,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1348627450980391,
"calib/std_conf": 0.4485283107240316,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4387948717948718,
"calib/step_q_c_n": 780.0,
"calib/step_q_gap": 0.11264481029425683,
"calib/step_q_w": 0.326150061500615,
"calib/step_q_w_n": 813.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2860.0,
"completions/max_terminated_length": 2860.0,
"completions/mean_length": 473.18359375,
"completions/mean_terminated_length": 473.18359375,
"completions/min_length": 172.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.20586666666666667,
"grad_norm": 0.04571477696299553,
"kl": 0.08908843994140625,
"learning_rate": 1.9444444444444447e-07,
"loss": 0.0191,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03606698662042618,
"mask/share_reasoning": 0.8280755281448364,
"mask/share_step_conf": 0.13585752248764038,
"num_tokens": 44467261.0,
"reward": 1.199505090713501,
"reward_std": 0.19832319021224976,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.7623148560523987,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8864426016807556,
"step": 193
},
{
"adv/mean_abs_final_conf": 0.6319118142127991,
"adv/mean_abs_reasoning": 0.549137532711029,
"adv/mean_abs_step_conf": 0.7619778513908386,
"adv/ratio_final_to_reasoning": 1.1507350646624406,
"adv/ratio_step_to_reasoning": 1.3875901864311502,
"adv/std_final_conf": 0.8473681211471558,
"adv/std_reasoning": 0.8098301887512207,
"adv/std_step_conf": 0.9348081350326538,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7936793422404933,
"calib/avg_num_step_conf": 5.85546875,
"calib/ece": 0.24752948207171316,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6095617529880478,
"calib/gap": 0.41372245632065763,
"calib/mean_conf": 0.6714346613545817,
"calib/mu_c": 0.8560438848920863,
"calib/mu_w": 0.44232142857142864,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.18258964143426298,
"calib/std_conf": 0.4427708453228523,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.48518471337579616,
"calib/step_q_c_n": 785.0,
"calib/step_q_gap": 0.1487841531517065,
"calib/step_q_w": 0.33640056022408965,
"calib/step_q_w_n": 714.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2562.0,
"completions/max_terminated_length": 2562.0,
"completions/mean_length": 454.81640625,
"completions/mean_terminated_length": 456.60003662109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.20693333333333333,
"grad_norm": 0.05285657197237015,
"kl": 0.087890625,
"learning_rate": 1.6666666666666668e-07,
"loss": 0.0059,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.037625670433044434,
"mask/share_reasoning": 0.822894275188446,
"mask/share_step_conf": 0.13557374477386475,
"num_tokens": 44689638.0,
"reward": 1.1636325120925903,
"reward_std": 0.24643751978874207,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.728967547416687,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8629273772239685,
"step": 194
},
{
"adv/mean_abs_final_conf": 0.6661218404769897,
"adv/mean_abs_reasoning": 0.4485500156879425,
"adv/mean_abs_step_conf": 0.72472083568573,
"adv/ratio_final_to_reasoning": 1.4850558849169957,
"adv/ratio_step_to_reasoning": 1.615696823851903,
"adv/std_final_conf": 0.8545449376106262,
"adv/std_reasoning": 0.7206267714500427,
"adv/std_step_conf": 0.9352039694786072,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7691960252935863,
"calib/avg_num_step_conf": 6.16015625,
"calib/ece": 0.19893442622950816,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.5860655737704918,
"calib/gap": 0.470840108401084,
"calib/mean_conf": 0.675655737704918,
"calib/mu_c": 0.8338888888888889,
"calib/mu_w": 0.3630487804878049,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.10532786885245898,
"calib/std_conf": 0.43194183675111997,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4587527352297593,
"calib/step_q_c_n": 914.0,
"calib/step_q_gap": 0.19261397203217256,
"calib/step_q_w": 0.2661387631975867,
"calib/step_q_w_n": 663.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 3001.0,
"completions/max_terminated_length": 3001.0,
"completions/mean_length": 457.61328125,
"completions/mean_terminated_length": 468.59600830078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.208,
"grad_norm": 0.0644913911819458,
"kl": 0.089080810546875,
"learning_rate": 1.3888888888888888e-07,
"loss": -0.1316,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.036113277077674866,
"mask/share_reasoning": 0.8115068674087524,
"mask/share_step_conf": 0.12894240021705627,
"num_tokens": 44912771.0,
"reward": 1.15213143825531,
"reward_std": 0.24076297879219055,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.754950761795044,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": 0.822458028793335,
"step": 195
},
{
"adv/mean_abs_final_conf": 0.5296871662139893,
"adv/mean_abs_reasoning": 0.45094752311706543,
"adv/mean_abs_step_conf": 0.7463574409484863,
"adv/ratio_final_to_reasoning": 1.1746093260534067,
"adv/ratio_step_to_reasoning": 1.6550871280752835,
"adv/std_final_conf": 0.7749639749526978,
"adv/std_reasoning": 0.739151656627655,
"adv/std_step_conf": 0.9348659515380859,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7914329423763387,
"calib/avg_num_step_conf": 5.43359375,
"calib/ece": 0.2077952755905511,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6850393700787402,
"calib/gap": 0.4285938296787352,
"calib/mean_conf": 0.7494488188976377,
"calib/mu_c": 0.9283108108108107,
"calib/mu_w": 0.49971698113207547,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.18728346456692907,
"calib/std_conf": 0.39647920398366215,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5182375478927204,
"calib/step_q_c_n": 783.0,
"calib/step_q_gap": 0.11843491631377301,
"calib/step_q_w": 0.39980263157894735,
"calib/step_q_w_n": 608.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2012.0,
"completions/max_terminated_length": 2012.0,
"completions/mean_length": 391.96484375,
"completions/mean_terminated_length": 391.96484375,
"completions/min_length": 132.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.20906666666666668,
"grad_norm": 0.04935779422521591,
"kl": 0.0960540771484375,
"learning_rate": 1.1111111111111112e-07,
"loss": -0.0108,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04187465459108353,
"mask/share_reasoning": 0.8143314123153687,
"mask/share_step_conf": 0.14379391074180603,
"num_tokens": 45115658.0,
"reward": 1.1880606412887573,
"reward_std": 0.16772204637527466,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.7740910053253174,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8591660261154175,
"step": 196
},
{
"adv/mean_abs_final_conf": 0.6387770175933838,
"adv/mean_abs_reasoning": 0.5504963994026184,
"adv/mean_abs_step_conf": 0.7230050563812256,
"adv/ratio_final_to_reasoning": 1.1603654779333066,
"adv/ratio_step_to_reasoning": 1.3133692739240588,
"adv/std_final_conf": 0.8457547426223755,
"adv/std_reasoning": 0.8098015189170837,
"adv/std_step_conf": 0.9345408082008362,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7607559055118109,
"calib/avg_num_step_conf": 6.5,
"calib/ece": 0.2756746031746031,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5833333333333334,
"calib/gap": 0.36511937007874007,
"calib/mean_conf": 0.6560714285714286,
"calib/mu_c": 0.8400799999999999,
"calib/mu_w": 0.47496062992125987,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.21785714285714278,
"calib/std_conf": 0.43884141617303263,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.461313672922252,
"calib/step_q_c_n": 746.0,
"calib/step_q_gap": 0.14207620015536754,
"calib/step_q_w": 0.31923747276688447,
"calib/step_q_w_n": 918.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2627.0,
"completions/max_terminated_length": 2627.0,
"completions/mean_length": 478.875,
"completions/mean_terminated_length": 484.5533752441406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 186.0,
"epoch": 0.21013333333333334,
"grad_norm": 0.04360397160053253,
"kl": 0.08504486083984375,
"learning_rate": 8.333333333333334e-08,
"loss": -0.0231,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.0352955237030983,
"mask/share_reasoning": 0.8155674934387207,
"mask/share_step_conf": 0.1374182403087616,
"num_tokens": 45343306.0,
"reward": 1.1531352996826172,
"reward_std": 0.22259561717510223,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.7032074332237244,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8723545074462891,
"step": 197
},
{
"adv/mean_abs_final_conf": 0.5296421647071838,
"adv/mean_abs_reasoning": 0.46181297302246094,
"adv/mean_abs_step_conf": 0.7478198409080505,
"adv/ratio_final_to_reasoning": 1.146875890559757,
"adv/ratio_step_to_reasoning": 1.6193131951528767,
"adv/std_final_conf": 0.759784996509552,
"adv/std_reasoning": 0.7206684947013855,
"adv/std_step_conf": 0.934968888759613,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.8539823008849556,
"calib/avg_num_step_conf": 6.46484375,
"calib/ece": 0.1557539682539683,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5476190476190477,
"calib/gap": 0.5803622588654739,
"calib/mean_conf": 0.6147222222222223,
"calib/mu_c": 0.8749640287769783,
"calib/mu_w": 0.29460176991150444,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.10944444444444448,
"calib/std_conf": 0.45056085223825754,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4732279495990836,
"calib/step_q_c_n": 873.0,
"calib/step_q_gap": 0.1517062104686489,
"calib/step_q_w": 0.3215217391304347,
"calib/step_q_w_n": 782.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2979.0,
"completions/max_terminated_length": 2979.0,
"completions/mean_length": 458.8359375,
"completions/mean_terminated_length": 460.63531494140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 99.0,
"epoch": 0.2112,
"grad_norm": 0.03705943003296852,
"kl": 0.09426116943359375,
"learning_rate": 5.555555555555556e-08,
"loss": 0.0507,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.039732448756694794,
"mask/share_reasoning": 0.8041092157363892,
"mask/share_step_conf": 0.15225210785865784,
"num_tokens": 45566152.0,
"reward": 1.2149150371551514,
"reward_std": 0.18363887071609497,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.815900444984436,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8728281259536743,
"step": 198
},
{
"adv/mean_abs_final_conf": 0.6824038028717041,
"adv/mean_abs_reasoning": 0.6048569679260254,
"adv/mean_abs_step_conf": 0.74017333984375,
"adv/ratio_final_to_reasoning": 1.1282068969323054,
"adv/ratio_step_to_reasoning": 1.2237163149193875,
"adv/std_final_conf": 0.876178503036499,
"adv/std_reasoning": 0.826652467250824,
"adv/std_step_conf": 0.9352187514305115,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7021883289124669,
"calib/avg_num_step_conf": 6.5078125,
"calib/ece": 0.2850200803212851,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.5983935742971888,
"calib/gap": 0.2835285145888594,
"calib/mean_conf": 0.6811646586345381,
"calib/mu_c": 0.7995862068965517,
"calib/mu_w": 0.5160576923076923,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19192771084337348,
"calib/std_conf": 0.42563329734788,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.44136732329084594,
"calib/step_q_c_n": 863.0,
"calib/step_q_gap": 0.12920044906917716,
"calib/step_q_w": 0.3121668742216688,
"calib/step_q_w_n": 803.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2531.0,
"completions/max_terminated_length": 2531.0,
"completions/mean_length": 503.546875,
"completions/mean_terminated_length": 509.5177917480469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 120.0,
"epoch": 0.21226666666666666,
"grad_norm": 0.059340838342905045,
"kl": 0.0843505859375,
"learning_rate": 2.777777777777778e-08,
"loss": -0.1241,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03569798544049263,
"mask/share_reasoning": 0.8174113035202026,
"mask/share_step_conf": 0.13517196476459503,
"num_tokens": 45799260.0,
"reward": 1.1359705924987793,
"reward_std": 0.27152296900749207,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6812281012535095,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8563086986541748,
"step": 199
},
{
"adv/mean_abs_final_conf": 0.544896125793457,
"adv/mean_abs_reasoning": 0.46680164337158203,
"adv/mean_abs_step_conf": 0.7455019950866699,
"adv/ratio_final_to_reasoning": 1.1672969312143369,
"adv/ratio_step_to_reasoning": 1.5970423533690041,
"adv/std_final_conf": 0.7886665463447571,
"adv/std_reasoning": 0.720587432384491,
"adv/std_step_conf": 0.9341596961021423,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.8773305813553491,
"calib/avg_num_step_conf": 6.453125,
"calib/ece": 0.16403225806451616,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.6491935483870968,
"calib/gap": 0.5541093911248711,
"calib/mean_conf": 0.6891129032258064,
"calib/mu_c": 0.9013725490196078,
"calib/mu_w": 0.3472631578947368,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.11810483870967745,
"calib/std_conf": 0.43646307920708033,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.500355421686747,
"calib/step_q_c_n": 830.0,
"calib/step_q_gap": 0.2503432562366253,
"calib/step_q_w": 0.25001216545012167,
"calib/step_q_w_n": 822.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2616.0,
"completions/max_terminated_length": 2616.0,
"completions/mean_length": 485.99609375,
"completions/mean_terminated_length": 493.7103576660156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.21333333333333335,
"grad_norm": 0.040903497487306595,
"kl": 0.09577178955078125,
"learning_rate": 0.0,
"loss": -0.1536,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03609899431467056,
"mask/share_reasoning": 0.8205278515815735,
"mask/share_step_conf": 0.12774814665317535,
"num_tokens": 46031723.0,
"reward": 1.1995675563812256,
"reward_std": 0.2133074849843979,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.7961195111274719,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8603435754776001,
"step": 200
},
{
"epoch": 0.21333333333333335,
"step": 200,
"total_flos": 0.0,
"train_loss": -0.015639179518911986,
"train_runtime": 14298.7628,
"train_samples_per_second": 3.581,
"train_steps_per_second": 0.014
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 46031723,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}