Files
PureRL-1.5B-v7-s2-async-l2-…/trainer_state.json
ModelHub XC 09ce115b17 初始化项目,由ModelHub XC社区提供模型
Model: zhaohq/PureRL-1.5B-v7-s2-async-l2-maskon
Source: Original Platform
2026-06-04 16:20:25 +08:00

12243 lines
504 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21333333333333335,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"adv/mean_abs_final_conf": 0.773959219455719,
"adv/mean_abs_reasoning": 0.47714588046073914,
"adv/mean_abs_step_conf": 0.7493494749069214,
"adv/ratio_final_to_reasoning": 1.622059942565935,
"adv/ratio_step_to_reasoning": 1.5704829604383013,
"adv/std_final_conf": 0.9294352531433105,
"adv/std_reasoning": 0.7393431663513184,
"adv/std_step_conf": 0.9337335228919983,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.38076182006817844,
"calib/avg_num_step_conf": 5.23046875,
"calib/ece": 0.2003187250996017,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.2948207171314741,
"calib/gap": -0.026059730250481805,
"calib/mean_conf": 0.8737051792828686,
"calib/mu_c": 0.865606936416185,
"calib/mu_w": 0.8916666666666668,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19239043824701207,
"calib/std_conf": 0.09027744273295583,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7959393232205367,
"calib/step_q_c_n": 857.0,
"calib/step_q_gap": -0.006446568895645877,
"calib/step_q_w": 0.8023858921161826,
"calib/step_q_w_n": 482.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2492.0,
"completions/max_terminated_length": 2492.0,
"completions/mean_length": 474.94921875,
"completions/mean_terminated_length": 478.68896484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.0010666666666666667,
"grad_norm": 0.042860016226768494,
"kl": 0.000291675329208374,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.011,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03466901555657387,
"mask/share_reasoning": 0.8340686559677124,
"mask/share_step_conf": 0.12344987690448761,
"num_tokens": 229171.0,
"reward": 0.8971271514892578,
"reward_std": 0.1976315677165985,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/asymmetric_l2_reward": 0.749505341053009,
"rewards/final_brier_reward_step": 0.7142800688743591,
"rewards/format_reward_step": 0.9765625,
"step": 1
},
{
"adv/mean_abs_final_conf": 0.7672724723815918,
"adv/mean_abs_reasoning": 0.5104547739028931,
"adv/mean_abs_step_conf": 0.773115873336792,
"adv/ratio_final_to_reasoning": 1.503115479781084,
"adv/ratio_step_to_reasoning": 1.5145629208746838,
"adv/std_final_conf": 0.9330522418022156,
"adv/std_reasoning": 0.7575037479400635,
"adv/std_step_conf": 0.9337809085845947,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.44343065693430656,
"calib/avg_num_step_conf": 5.05859375,
"calib/ece": 0.3349411764705883,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.2823529411764706,
"calib/gap": 0.002352468143016151,
"calib/mean_conf": 0.8721960784313726,
"calib/mu_c": 0.8732846715328467,
"calib/mu_w": 0.8709322033898306,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3349411764705883,
"calib/std_conf": 0.07627016470309335,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7954391371340525,
"calib/step_q_c_n": 649.0,
"calib/step_q_gap": 0.011011892552009073,
"calib/step_q_w": 0.7844272445820434,
"calib/step_q_w_n": 646.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1966.0,
"completions/max_terminated_length": 1966.0,
"completions/mean_length": 492.9765625,
"completions/mean_terminated_length": 494.9098205566406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.0021333333333333334,
"grad_norm": 0.04081178456544876,
"kl": 0.00037539005279541016,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0106,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03364308178424835,
"mask/share_reasoning": 0.8523939251899719,
"mask/share_step_conf": 0.11005672812461853,
"num_tokens": 458661.0,
"reward": 0.8363707661628723,
"reward_std": 0.19354595243930817,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/asymmetric_l2_reward": 0.7344152927398682,
"rewards/final_brier_reward_step": 0.6320762038230896,
"rewards/format_reward_step": 0.99609375,
"step": 2
},
{
"adv/mean_abs_final_conf": 0.753760814666748,
"adv/mean_abs_reasoning": 0.43374836444854736,
"adv/mean_abs_step_conf": 0.751661479473114,
"adv/ratio_final_to_reasoning": 1.737783647034735,
"adv/ratio_step_to_reasoning": 1.7329436629202057,
"adv/std_final_conf": 0.929868757724762,
"adv/std_reasoning": 0.7013001441955566,
"adv/std_step_conf": 0.9305136799812317,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5068113362541073,
"calib/avg_num_step_conf": 5.0078125,
"calib/ece": 0.22877952755905512,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.3425196850393701,
"calib/gap": 0.004660460021905566,
"calib/mean_conf": 0.8794094488188977,
"calib/mu_c": 0.881024096385542,
"calib/mu_w": 0.8763636363636365,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.22732283464566927,
"calib/std_conf": 0.05409278327150863,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7904369538077403,
"calib/step_q_c_n": 801.0,
"calib/step_q_gap": 0.023389136759923157,
"calib/step_q_w": 0.7670478170478171,
"calib/step_q_w_n": 481.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2469.0,
"completions/max_terminated_length": 2469.0,
"completions/mean_length": 508.640625,
"completions/mean_terminated_length": 510.63531494140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.0032,
"grad_norm": 0.04966992139816284,
"kl": 0.0011971145868301392,
"learning_rate": 7.5e-07,
"loss": 0.0326,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.032452650368213654,
"mask/share_reasoning": 0.8557740449905396,
"mask/share_step_conf": 0.10786702483892441,
"num_tokens": 694129.0,
"reward": 0.9003467559814453,
"reward_std": 0.16390666365623474,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/asymmetric_l2_reward": 0.7596950531005859,
"rewards/final_brier_reward_step": 0.7144359350204468,
"rewards/format_reward_step": 0.984375,
"step": 3
},
{
"adv/mean_abs_final_conf": 0.7722760438919067,
"adv/mean_abs_reasoning": 0.4103483259677887,
"adv/mean_abs_step_conf": 0.7630362510681152,
"adv/ratio_final_to_reasoning": 1.8820012048800911,
"adv/ratio_step_to_reasoning": 1.859484254671997,
"adv/std_final_conf": 0.9281506538391113,
"adv/std_reasoning": 0.6815478205680847,
"adv/std_step_conf": 0.9337376952171326,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.4910992283741709,
"calib/avg_num_step_conf": 5.30859375,
"calib/ece": 0.22568627450980389,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.22745098039215686,
"calib/gap": 0.0012657371057265276,
"calib/mean_conf": 0.8766666666666667,
"calib/mu_c": 0.8771084337349399,
"calib/mu_w": 0.8758426966292133,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.22568627450980389,
"calib/std_conf": 0.04235548285764873,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.798277108433735,
"calib/step_q_c_n": 830.0,
"calib/step_q_gap": 0.006386749265493097,
"calib/step_q_w": 0.7918903591682419,
"calib/step_q_w_n": 529.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2305.0,
"completions/max_terminated_length": 2305.0,
"completions/mean_length": 510.30078125,
"completions/mean_terminated_length": 510.30078125,
"completions/min_length": 143.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.004266666666666667,
"grad_norm": 0.0446762815117836,
"kl": 0.0002792179584503174,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.046,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03316652029752731,
"mask/share_reasoning": 0.8474521040916443,
"mask/share_step_conf": 0.1193813905119896,
"num_tokens": 930934.0,
"reward": 0.8870111703872681,
"reward_std": 0.16794714331626892,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/asymmetric_l2_reward": 0.736449658870697,
"rewards/final_brier_reward_step": 0.7102289199829102,
"rewards/format_reward_step": 0.98828125,
"step": 4
},
{
"adv/mean_abs_final_conf": 0.7767199873924255,
"adv/mean_abs_reasoning": 0.39009517431259155,
"adv/mean_abs_step_conf": 0.7709915637969971,
"adv/ratio_final_to_reasoning": 1.9911038088618427,
"adv/ratio_step_to_reasoning": 1.9764191268338662,
"adv/std_final_conf": 0.9301847219467163,
"adv/std_reasoning": 0.6612535119056702,
"adv/std_step_conf": 0.9337782263755798,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.4286752080306432,
"calib/avg_num_step_conf": 4.9609375,
"calib/ece": 0.33842105263157896,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.2874493927125506,
"calib/gap": -0.011496499801875615,
"calib/mean_conf": 0.880931174089069,
"calib/mu_c": 0.8756716417910448,
"calib/mu_w": 0.8871681415929205,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.33842105263157896,
"calib/std_conf": 0.04485179508228513,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.8001156069364163,
"calib/step_q_c_n": 692.0,
"calib/step_q_gap": 0.007139828389703395,
"calib/step_q_w": 0.7929757785467129,
"calib/step_q_w_n": 578.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2878.0,
"completions/max_terminated_length": 2878.0,
"completions/mean_length": 524.61328125,
"completions/mean_terminated_length": 526.6705932617188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.005333333333333333,
"grad_norm": 0.053633056581020355,
"kl": 0.000286102294921875,
"learning_rate": 1.25e-06,
"loss": -0.0005,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03404708951711655,
"mask/share_reasoning": 0.850751519203186,
"mask/share_step_conf": 0.111295185983181,
"num_tokens": 1171923.0,
"reward": 0.7870633602142334,
"reward_std": 0.16195642948150635,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/asymmetric_l2_reward": 0.6709086894989014,
"rewards/final_brier_reward_step": 0.6063430309295654,
"rewards/format_reward_step": 0.9609375,
"step": 5
},
{
"adv/mean_abs_final_conf": 0.7977774739265442,
"adv/mean_abs_reasoning": 0.42455434799194336,
"adv/mean_abs_step_conf": 0.7404891848564148,
"adv/ratio_final_to_reasoning": 1.879093872668767,
"adv/ratio_step_to_reasoning": 1.744156403906307,
"adv/std_final_conf": 0.9312154054641724,
"adv/std_reasoning": 0.6816370487213135,
"adv/std_step_conf": 0.9339516758918762,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.433967112024666,
"calib/avg_num_step_conf": 5.140625,
"calib/ece": 0.3313545816733068,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.33067729083665337,
"calib/gap": -0.009260662898253003,
"calib/mean_conf": 0.8851394422310758,
"calib/mu_c": 0.8810071942446042,
"calib/mu_w": 0.8902678571428572,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3313545816733068,
"calib/std_conf": 0.04471557388534855,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7983999999999999,
"calib/step_q_c_n": 675.0,
"calib/step_q_gap": -0.011537597503900376,
"calib/step_q_w": 0.8099375975039003,
"calib/step_q_w_n": 641.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2532.0,
"completions/max_terminated_length": 2532.0,
"completions/mean_length": 464.72265625,
"completions/mean_terminated_length": 464.72265625,
"completions/min_length": 184.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.0064,
"grad_norm": 0.051193155348300934,
"kl": 0.00040727853775024414,
"learning_rate": 1.5e-06,
"loss": 0.0986,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.0360930897295475,
"mask/share_reasoning": 0.8376978635787964,
"mask/share_step_conf": 0.12620902061462402,
"num_tokens": 1396844.0,
"reward": 0.8131352663040161,
"reward_std": 0.18004879355430603,
"rewards/accuracy_reward_step": 0.546875,
"rewards/asymmetric_l2_reward": 0.6967132091522217,
"rewards/final_brier_reward_step": 0.6240886449813843,
"rewards/format_reward_step": 0.98046875,
"step": 6
},
{
"adv/mean_abs_final_conf": 0.7584425210952759,
"adv/mean_abs_reasoning": 0.47301214933395386,
"adv/mean_abs_step_conf": 0.7564212083816528,
"adv/ratio_final_to_reasoning": 1.60343137520513,
"adv/ratio_step_to_reasoning": 1.5991580965663692,
"adv/std_final_conf": 0.9305387735366821,
"adv/std_reasoning": 0.7393888831138611,
"adv/std_step_conf": 0.9338541030883789,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.47629160284083,
"calib/avg_num_step_conf": 5.03125,
"calib/ece": 0.22399209486166,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.32806324110671936,
"calib/gap": -0.002414705472775225,
"calib/mean_conf": 0.8837549407114624,
"calib/mu_c": 0.8829341317365269,
"calib/mu_w": 0.8853488372093021,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.22383399209486157,
"calib/std_conf": 0.04383461580512651,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7935455635491607,
"calib/step_q_c_n": 834.0,
"calib/step_q_gap": 0.0046248587033457245,
"calib/step_q_w": 0.788920704845815,
"calib/step_q_w_n": 454.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2655.0,
"completions/max_terminated_length": 2655.0,
"completions/mean_length": 542.8203125,
"completions/mean_terminated_length": 544.9490356445312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.007466666666666667,
"grad_norm": 0.07613871246576309,
"kl": 0.00029200315475463867,
"learning_rate": 1.75e-06,
"loss": 0.1122,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.030497439205646515,
"mask/share_reasoning": 0.8622984290122986,
"mask/share_step_conf": 0.1032978817820549,
"num_tokens": 1643230.0,
"reward": 0.8966531753540039,
"reward_std": 0.20346349477767944,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/asymmetric_l2_reward": 0.757659375667572,
"rewards/final_brier_reward_step": 0.7090843915939331,
"rewards/format_reward_step": 0.98046875,
"step": 7
},
{
"adv/mean_abs_final_conf": 0.7677263617515564,
"adv/mean_abs_reasoning": 0.44562456011772156,
"adv/mean_abs_step_conf": 0.7679376602172852,
"adv/ratio_final_to_reasoning": 1.722809805520469,
"adv/ratio_step_to_reasoning": 1.7232839680434522,
"adv/std_final_conf": 0.929553747177124,
"adv/std_reasoning": 0.7014294862747192,
"adv/std_step_conf": 0.9342193603515625,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5305101373446698,
"calib/avg_num_step_conf": 4.59375,
"calib/ece": 0.32694779116465855,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.3092369477911647,
"calib/gap": 0.004426422498364779,
"calib/mean_conf": 0.8822891566265061,
"calib/mu_c": 0.8842446043165467,
"calib/mu_w": 0.8798181818181819,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.3255020080321285,
"calib/std_conf": 0.04544272094007264,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.7964705882352942,
"calib/step_q_c_n": 629.0,
"calib/step_q_gap": 0.03597698677277128,
"calib/step_q_w": 0.7604936014625229,
"calib/step_q_w_n": 547.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2316.0,
"completions/max_terminated_length": 2316.0,
"completions/mean_length": 533.7578125,
"completions/mean_terminated_length": 535.8510131835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.008533333333333334,
"grad_norm": 0.04478863254189491,
"kl": 0.0004049241542816162,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.0452,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.032160256057977676,
"mask/share_reasoning": 0.8618191480636597,
"mask/share_step_conf": 0.10211435705423355,
"num_tokens": 1886384.0,
"reward": 0.8338550925254822,
"reward_std": 0.18050694465637207,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/asymmetric_l2_reward": 0.7384202480316162,
"rewards/final_brier_reward_step": 0.6269460916519165,
"rewards/format_reward_step": 0.96875,
"step": 8
},
{
"adv/mean_abs_final_conf": 0.7801663875579834,
"adv/mean_abs_reasoning": 0.43747538328170776,
"adv/mean_abs_step_conf": 0.7640990614891052,
"adv/ratio_final_to_reasoning": 1.7833377999593711,
"adv/ratio_step_to_reasoning": 1.7466104166987415,
"adv/std_final_conf": 0.9302932620048523,
"adv/std_reasoning": 0.7014261484146118,
"adv/std_step_conf": 0.9335131645202637,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.42748740982714034,
"calib/avg_num_step_conf": 4.64453125,
"calib/ece": 0.2562151394422311,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.2908366533864542,
"calib/gap": -0.010103443582414662,
"calib/mean_conf": 0.880199203187251,
"calib/mu_c": 0.8764556962025316,
"calib/mu_w": 0.8865591397849463,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.2534661354581674,
"calib/std_conf": 0.044394556566673175,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.7760684931506849,
"calib/step_q_c_n": 730.0,
"calib/step_q_gap": 0.014957382039573863,
"calib/step_q_w": 0.7611111111111111,
"calib/step_q_w_n": 459.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2822.0,
"completions/max_terminated_length": 2822.0,
"completions/mean_length": 511.79296875,
"completions/mean_terminated_length": 513.800048828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.0096,
"grad_norm": 0.042500849813222885,
"kl": 0.000448763370513916,
"learning_rate": 2.25e-06,
"loss": 0.0164,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03407716751098633,
"mask/share_reasoning": 0.8594886660575867,
"mask/share_step_conf": 0.10252793878316879,
"num_tokens": 2124939.0,
"reward": 0.8477847576141357,
"reward_std": 0.20172211527824402,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/asymmetric_l2_reward": 0.7034733295440674,
"rewards/final_brier_reward_step": 0.6749086380004883,
"rewards/format_reward_step": 0.96875,
"step": 9
},
{
"adv/mean_abs_final_conf": 0.7686961889266968,
"adv/mean_abs_reasoning": 0.46870675683021545,
"adv/mean_abs_step_conf": 0.7512601017951965,
"adv/ratio_final_to_reasoning": 1.6400364998474934,
"adv/ratio_step_to_reasoning": 1.6028360821504721,
"adv/std_final_conf": 0.9307101368904114,
"adv/std_reasoning": 0.72056645154953,
"adv/std_step_conf": 0.9345166087150574,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.4519811320754717,
"calib/avg_num_step_conf": 5.2890625,
"calib/ece": 0.30464843750000004,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.375,
"calib/gap": -0.0036691823899371867,
"calib/mean_conf": 0.8905859375,
"calib/mu_c": 0.8890666666666666,
"calib/mu_w": 0.8927358490566037,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.30464843750000004,
"calib/std_conf": 0.04735224178691115,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7855445544554456,
"calib/step_q_c_n": 707.0,
"calib/step_q_gap": -0.0018897577547554167,
"calib/step_q_w": 0.787434312210201,
"calib/step_q_w_n": 647.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1378.0,
"completions/max_terminated_length": 1378.0,
"completions/mean_length": 516.8359375,
"completions/mean_terminated_length": 518.86279296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 187.0,
"epoch": 0.010666666666666666,
"grad_norm": 0.03771434351801872,
"kl": 0.0006309151649475098,
"learning_rate": 2.5e-06,
"loss": 0.075,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03189965710043907,
"mask/share_reasoning": 0.8523790836334229,
"mask/share_step_conf": 0.11181497573852539,
"num_tokens": 2364049.0,
"reward": 0.8517932891845703,
"reward_std": 0.19309264421463013,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/asymmetric_l2_reward": 0.7258471250534058,
"rewards/final_brier_reward_step": 0.660551905632019,
"rewards/format_reward_step": 1.0,
"step": 10
},
{
"adv/mean_abs_final_conf": 0.7453495264053345,
"adv/mean_abs_reasoning": 0.3924379348754883,
"adv/mean_abs_step_conf": 0.7527601718902588,
"adv/ratio_final_to_reasoning": 1.8992800139003307,
"adv/ratio_step_to_reasoning": 1.9181636253617853,
"adv/std_final_conf": 0.9260469079017639,
"adv/std_reasoning": 0.681506335735321,
"adv/std_step_conf": 0.9340210556983948,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.4787878787878788,
"calib/avg_num_step_conf": 5.40625,
"calib/ece": 0.30251968503937005,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.4566929133858268,
"calib/gap": -0.019564679048550082,
"calib/mean_conf": 0.8861417322834646,
"calib/mu_c": 0.878516129032258,
"calib/mu_w": 0.898080808080808,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2892125984251968,
"calib/std_conf": 0.10039329936618835,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7779198966408268,
"calib/step_q_c_n": 774.0,
"calib/step_q_gap": 0.008329732706400583,
"calib/step_q_w": 0.7695901639344263,
"calib/step_q_w_n": 610.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2714.0,
"completions/max_terminated_length": 2714.0,
"completions/mean_length": 531.82421875,
"completions/mean_terminated_length": 531.82421875,
"completions/min_length": 172.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.011733333333333333,
"grad_norm": 0.038181111216545105,
"kl": 0.001034379005432129,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.103,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.031716808676719666,
"mask/share_reasoning": 0.8542863130569458,
"mask/share_step_conf": 0.11399686336517334,
"num_tokens": 2604676.0,
"reward": 0.8596004247665405,
"reward_std": 0.16910496354103088,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/asymmetric_l2_reward": 0.7382351160049438,
"rewards/final_brier_reward_step": 0.6614344120025635,
"rewards/format_reward_step": 0.9921875,
"step": 11
},
{
"adv/mean_abs_final_conf": 0.7681692242622375,
"adv/mean_abs_reasoning": 0.4303590655326843,
"adv/mean_abs_step_conf": 0.7345424294471741,
"adv/ratio_final_to_reasoning": 1.7849495590651097,
"adv/ratio_step_to_reasoning": 1.7068129575427475,
"adv/std_final_conf": 0.9277141094207764,
"adv/std_reasoning": 0.7013878226280212,
"adv/std_step_conf": 0.9326591491699219,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.4637932279257816,
"calib/avg_num_step_conf": 5.38671875,
"calib/ece": 0.2204761904761905,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.5277777777777778,
"calib/gap": -0.002456140350877156,
"calib/mean_conf": 0.8983333333333333,
"calib/mu_c": 0.8975438596491229,
"calib/mu_w": 0.9,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2201190476190476,
"calib/std_conf": 0.05905673892705567,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7772737819025521,
"calib/step_q_c_n": 862.0,
"calib/step_q_gap": 0.025649023682049243,
"calib/step_q_w": 0.7516247582205029,
"calib/step_q_w_n": 517.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2306.0,
"completions/max_terminated_length": 2306.0,
"completions/mean_length": 467.9765625,
"completions/mean_terminated_length": 471.6614074707031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.0128,
"grad_norm": 0.06263236701488495,
"kl": 0.003935456275939941,
"learning_rate": 3e-06,
"loss": 0.0278,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03686348348855972,
"mask/share_reasoning": 0.8269122838973999,
"mask/share_step_conf": 0.12841171026229858,
"num_tokens": 2828654.0,
"reward": 0.9123976230621338,
"reward_std": 0.17215202748775482,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/asymmetric_l2_reward": 0.7845866680145264,
"rewards/final_brier_reward_step": 0.7113023400306702,
"rewards/format_reward_step": 0.9765625,
"step": 12
},
{
"adv/mean_abs_final_conf": 0.7388582229614258,
"adv/mean_abs_reasoning": 0.41692185401916504,
"adv/mean_abs_step_conf": 0.7494971752166748,
"adv/ratio_final_to_reasoning": 1.7721743675433765,
"adv/ratio_step_to_reasoning": 1.7976922245534819,
"adv/std_final_conf": 0.9258735775947571,
"adv/std_reasoning": 0.7204419374465942,
"adv/std_step_conf": 0.9338845610618591,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.510838445807771,
"calib/avg_num_step_conf": 4.87109375,
"calib/ece": 0.26177865612648216,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5296442687747036,
"calib/gap": 0.004040218132242646,
"calib/mean_conf": 0.9060474308300396,
"calib/mu_c": 0.9074846625766873,
"calib/mu_w": 0.9034444444444446,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.26177865612648216,
"calib/std_conf": 0.04516318373782039,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.766610824742268,
"calib/step_q_c_n": 776.0,
"calib/step_q_gap": 0.026143733447151263,
"calib/step_q_w": 0.7404670912951168,
"calib/step_q_w_n": 471.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2335.0,
"completions/max_terminated_length": 2335.0,
"completions/mean_length": 490.625,
"completions/mean_terminated_length": 490.625,
"completions/min_length": 118.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.013866666666666666,
"grad_norm": 0.04346369951963425,
"kl": 0.002653837203979492,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.0747,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.0344499908387661,
"mask/share_reasoning": 0.8513206243515015,
"mask/share_step_conf": 0.11422930657863617,
"num_tokens": 3058846.0,
"reward": 0.9044943451881409,
"reward_std": 0.173051655292511,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/asymmetric_l2_reward": 0.7901186943054199,
"rewards/final_brier_reward_step": 0.693869948387146,
"rewards/format_reward_step": 0.98828125,
"step": 13
},
{
"adv/mean_abs_final_conf": 0.7576812505722046,
"adv/mean_abs_reasoning": 0.5075523853302002,
"adv/mean_abs_step_conf": 0.7558131217956543,
"adv/ratio_final_to_reasoning": 1.4928138897018033,
"adv/ratio_step_to_reasoning": 1.489133227704057,
"adv/std_final_conf": 0.9261738061904907,
"adv/std_reasoning": 0.7575864791870117,
"adv/std_step_conf": 0.9350097179412842,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5146290491118077,
"calib/avg_num_step_conf": 5.40234375,
"calib/ece": 0.39185483870967736,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.7419354838709677,
"calib/gap": 0.0006008359456635137,
"calib/mean_conf": 0.9241129032258065,
"calib/mu_c": 0.9243939393939393,
"calib/mu_w": 0.9237931034482758,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.39185483870967736,
"calib/std_conf": 0.03558298108905692,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7218681318681318,
"calib/step_q_c_n": 728.0,
"calib/step_q_gap": 0.01843301736431502,
"calib/step_q_w": 0.7034351145038168,
"calib/step_q_w_n": 655.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2932.0,
"completions/max_terminated_length": 2932.0,
"completions/mean_length": 563.546875,
"completions/mean_terminated_length": 565.7568969726562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.014933333333333333,
"grad_norm": 0.04784300550818443,
"kl": 0.007000446319580078,
"learning_rate": 3.5e-06,
"loss": 0.0317,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03211609274148941,
"mask/share_reasoning": 0.8483536243438721,
"mask/share_step_conf": 0.11562406271696091,
"num_tokens": 3308514.0,
"reward": 0.8109242916107178,
"reward_std": 0.2076244205236435,
"rewards/accuracy_reward_step": 0.515625,
"rewards/asymmetric_l2_reward": 0.7517649531364441,
"rewards/final_brier_reward_step": 0.5739898681640625,
"rewards/format_reward_step": 0.96484375,
"step": 14
},
{
"adv/mean_abs_final_conf": 0.735247015953064,
"adv/mean_abs_reasoning": 0.41596412658691406,
"adv/mean_abs_step_conf": 0.7716025710105896,
"adv/ratio_final_to_reasoning": 1.7675731366210423,
"adv/ratio_step_to_reasoning": 1.854973834743334,
"adv/std_final_conf": 0.9187384843826294,
"adv/std_reasoning": 0.7012813687324524,
"adv/std_step_conf": 0.9337195158004761,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.49031918842014094,
"calib/avg_num_step_conf": 5.23046875,
"calib/ece": 0.3992156862745099,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.8901960784313725,
"calib/gap": -0.0005752814549054852,
"calib/mean_conf": 0.9364705882352942,
"calib/mu_c": 0.9362043795620437,
"calib/mu_w": 0.9367796610169492,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3992156862745099,
"calib/std_conf": 0.030718425805495036,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6808519553072626,
"calib/step_q_c_n": 716.0,
"calib/step_q_gap": 0.008267685644341216,
"calib/step_q_w": 0.6725842696629214,
"calib/step_q_w_n": 623.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1352.0,
"completions/max_terminated_length": 1352.0,
"completions/mean_length": 473.62890625,
"completions/mean_terminated_length": 475.4862976074219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.016,
"grad_norm": 0.03617151826620102,
"kl": 0.010650634765625,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.0205,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.034178197383880615,
"mask/share_reasoning": 0.8413941264152527,
"mask/share_step_conf": 0.1205214262008667,
"num_tokens": 3537643.0,
"reward": 0.840351939201355,
"reward_std": 0.1773625612258911,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/asymmetric_l2_reward": 0.7859764099121094,
"rewards/final_brier_reward_step": 0.588477373123169,
"rewards/format_reward_step": 0.99609375,
"step": 15
},
{
"adv/mean_abs_final_conf": 0.7502319812774658,
"adv/mean_abs_reasoning": 0.40567100048065186,
"adv/mean_abs_step_conf": 0.7679750919342041,
"adv/ratio_final_to_reasoning": 1.849360640490859,
"adv/ratio_step_to_reasoning": 1.8930983260432293,
"adv/std_final_conf": 0.9114437103271484,
"adv/std_reasoning": 0.6816147565841675,
"adv/std_step_conf": 0.9336596727371216,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5097402597402597,
"calib/avg_num_step_conf": 6.546875,
"calib/ece": 0.3347200000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.932,
"calib/gap": 0.0009997294372294796,
"calib/mean_conf": 0.95072,
"calib/mu_c": 0.9511038961038961,
"calib/mu_w": 0.9501041666666666,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3347200000000001,
"calib/std_conf": 0.027761152713819345,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6192934249263984,
"calib/step_q_c_n": 1019.0,
"calib/step_q_gap": 0.029019452323658768,
"calib/step_q_w": 0.5902739726027396,
"calib/step_q_w_n": 657.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2684.0,
"completions/max_terminated_length": 2684.0,
"completions/mean_length": 663.50390625,
"completions/mean_terminated_length": 668.7283325195312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.017066666666666667,
"grad_norm": 0.03823031485080719,
"kl": 0.010951042175292969,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0136,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.02500973641872406,
"mask/share_reasoning": 0.857978880405426,
"mask/share_step_conf": 0.10919886827468872,
"num_tokens": 3816348.0,
"reward": 0.879867672920227,
"reward_std": 0.17731714248657227,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/asymmetric_l2_reward": 0.8082501888275146,
"rewards/final_brier_reward_step": 0.6358601450920105,
"rewards/format_reward_step": 0.9765625,
"step": 16
},
{
"adv/mean_abs_final_conf": 0.749430775642395,
"adv/mean_abs_reasoning": 0.4871532618999481,
"adv/mean_abs_step_conf": 0.7657231688499451,
"adv/ratio_final_to_reasoning": 1.538388089036985,
"adv/ratio_step_to_reasoning": 1.5718321701543072,
"adv/std_final_conf": 0.9169648289680481,
"adv/std_reasoning": 0.7576295733451843,
"adv/std_step_conf": 0.9339720606803894,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5530997098793708,
"calib/avg_num_step_conf": 5.8125,
"calib/ece": 0.2446215139442231,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.9243027888446215,
"calib/gap": 0.0062330126736906966,
"calib/mean_conf": 0.9498007968127489,
"calib/mu_c": 0.951638418079096,
"calib/mu_w": 0.9454054054054053,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.2446215139442231,
"calib/std_conf": 0.025459133712774463,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.6141885325558795,
"calib/step_q_c_n": 1029.0,
"calib/step_q_gap": 0.04153058048616265,
"calib/step_q_w": 0.5726579520697168,
"calib/step_q_w_n": 459.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2605.0,
"completions/max_terminated_length": 2605.0,
"completions/mean_length": 507.30078125,
"completions/mean_terminated_length": 513.3162231445312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 107.0,
"epoch": 0.018133333333333335,
"grad_norm": 0.03812658414244652,
"kl": 0.017798423767089844,
"learning_rate": 4.25e-06,
"loss": -0.0293,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.032309580594301224,
"mask/share_reasoning": 0.8335152864456177,
"mask/share_step_conf": 0.12245635688304901,
"num_tokens": 4049745.0,
"reward": 0.9368460774421692,
"reward_std": 0.2139730602502823,
"rewards/accuracy_reward_step": 0.69140625,
"rewards/asymmetric_l2_reward": 0.8321975469589233,
"rewards/final_brier_reward_step": 0.7102445363998413,
"rewards/format_reward_step": 0.96484375,
"step": 17
},
{
"adv/mean_abs_final_conf": 0.7235511541366577,
"adv/mean_abs_reasoning": 0.4305734634399414,
"adv/mean_abs_step_conf": 0.7692475318908691,
"adv/ratio_final_to_reasoning": 1.680436012837522,
"adv/ratio_step_to_reasoning": 1.7865651211878917,
"adv/std_final_conf": 0.9193499088287354,
"adv/std_reasoning": 0.720542848110199,
"adv/std_step_conf": 0.9344537258148193,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.46025487350955774,
"calib/avg_num_step_conf": 5.11328125,
"calib/ece": 0.4365476190476191,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.9563492063492064,
"calib/gap": -0.00011040312914001316,
"calib/mean_conf": 0.9563888888888888,
"calib/mu_c": 0.9563358778625954,
"calib/mu_w": 0.9564462809917355,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.4365476190476191,
"calib/std_conf": 0.029413168474644355,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5846951219512195,
"calib/step_q_c_n": 656.0,
"calib/step_q_gap": 0.005368935121204288,
"calib/step_q_w": 0.5793261868300152,
"calib/step_q_w_n": 653.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2904.0,
"completions/max_terminated_length": 2904.0,
"completions/mean_length": 508.76953125,
"completions/mean_terminated_length": 512.7755737304688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 199.0,
"epoch": 0.0192,
"grad_norm": 0.04069630801677704,
"kl": 0.019733428955078125,
"learning_rate": 4.5e-06,
"loss": -0.0352,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03226814791560173,
"mask/share_reasoning": 0.8506003618240356,
"mask/share_step_conf": 0.10931900143623352,
"num_tokens": 4290710.0,
"reward": 0.8182658553123474,
"reward_std": 0.1954856812953949,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/asymmetric_l2_reward": 0.7965109348297119,
"rewards/final_brier_reward_step": 0.5423644781112671,
"rewards/format_reward_step": 0.9765625,
"step": 18
},
{
"adv/mean_abs_final_conf": 0.7590962648391724,
"adv/mean_abs_reasoning": 0.3977474570274353,
"adv/mean_abs_step_conf": 0.7660222053527832,
"adv/ratio_final_to_reasoning": 1.9084880404070375,
"adv/ratio_step_to_reasoning": 1.925900950008954,
"adv/std_final_conf": 0.9002686142921448,
"adv/std_reasoning": 0.66129070520401,
"adv/std_step_conf": 0.9337433576583862,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5557777777777777,
"calib/avg_num_step_conf": 4.80078125,
"calib/ece": 0.3707843137254901,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.9725490196078431,
"calib/gap": 0.007238095238095044,
"calib/mean_conf": 0.9590196078431373,
"calib/mu_c": 0.9619999999999999,
"calib/mu_w": 0.9547619047619048,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3707843137254901,
"calib/std_conf": 0.027911267101634094,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5466435506241332,
"calib/step_q_c_n": 721.0,
"calib/step_q_gap": 0.025147487632007248,
"calib/step_q_w": 0.5214960629921259,
"calib/step_q_w_n": 508.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1758.0,
"completions/max_terminated_length": 1758.0,
"completions/mean_length": 484.01953125,
"completions/mean_terminated_length": 485.91766357421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 198.0,
"epoch": 0.020266666666666665,
"grad_norm": 0.024780066683888435,
"kl": 0.02925872802734375,
"learning_rate": 4.75e-06,
"loss": -0.0072,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03218923509120941,
"mask/share_reasoning": 0.8538081645965576,
"mask/share_step_conf": 0.11009633541107178,
"num_tokens": 4519379.0,
"reward": 0.8946607112884521,
"reward_std": 0.16607698798179626,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/asymmetric_l2_reward": 0.8523170948028564,
"rewards/final_brier_reward_step": 0.6205980777740479,
"rewards/format_reward_step": 0.99609375,
"step": 19
},
{
"adv/mean_abs_final_conf": 0.7314550876617432,
"adv/mean_abs_reasoning": 0.5331639051437378,
"adv/mean_abs_step_conf": 0.7298775911331177,
"adv/ratio_final_to_reasoning": 1.3719141161000898,
"adv/ratio_step_to_reasoning": 1.3689553701808586,
"adv/std_final_conf": 0.9152284860610962,
"adv/std_reasoning": 0.7927942276000977,
"adv/std_step_conf": 0.9345114827156067,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.4764349489795918,
"calib/avg_num_step_conf": 5.6015625,
"calib/ece": 0.41123015873015883,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.9761904761904762,
"calib/gap": -0.0030535714285715443,
"calib/mean_conf": 0.964642857142857,
"calib/mu_c": 0.9632857142857142,
"calib/mu_w": 0.9663392857142857,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.4101587301587303,
"calib/std_conf": 0.02658229637795846,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5052864583333333,
"calib/step_q_c_n": 768.0,
"calib/step_q_gap": 0.01378495683183173,
"calib/step_q_w": 0.49150150150150157,
"calib/step_q_w_n": 666.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2446.0,
"completions/max_terminated_length": 2446.0,
"completions/mean_length": 500.95703125,
"completions/mean_terminated_length": 504.9015808105469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.021333333333333333,
"grad_norm": 0.025546662509441376,
"kl": 0.041919708251953125,
"learning_rate": 5e-06,
"loss": -0.0376,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.0341811366379261,
"mask/share_reasoning": 0.8281325697898865,
"mask/share_step_conf": 0.12987381219863892,
"num_tokens": 4752496.0,
"reward": 0.8547559976577759,
"reward_std": 0.21240723133087158,
"rewards/accuracy_reward_step": 0.546875,
"rewards/asymmetric_l2_reward": 0.8308833837509155,
"rewards/final_brier_reward_step": 0.5739409923553467,
"rewards/format_reward_step": 0.9765625,
"step": 20
},
{
"adv/mean_abs_final_conf": 0.7029703855514526,
"adv/mean_abs_reasoning": 0.4354853928089142,
"adv/mean_abs_step_conf": 0.7637710571289062,
"adv/ratio_final_to_reasoning": 1.6142226516881306,
"adv/ratio_step_to_reasoning": 1.7538385207423937,
"adv/std_final_conf": 0.8815454840660095,
"adv/std_reasoning": 0.7205679416656494,
"adv/std_step_conf": 0.9338224530220032,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.47911710282844305,
"calib/avg_num_step_conf": 6.03125,
"calib/ece": 0.3549011857707511,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9802371541501976,
"calib/gap": -0.002429949775310858,
"calib/mean_conf": 0.9698418972332016,
"calib/mu_c": 0.9689102564102563,
"calib/mu_w": 0.9713402061855672,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.35407114624505936,
"calib/std_conf": 0.022649911297309762,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4717988826815642,
"calib/step_q_c_n": 895.0,
"calib/step_q_gap": 0.04224572397586318,
"calib/step_q_w": 0.42955315870570104,
"calib/step_q_w_n": 649.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2070.0,
"completions/max_terminated_length": 2070.0,
"completions/mean_length": 512.80078125,
"completions/mean_terminated_length": 514.811767578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.0224,
"grad_norm": 0.022558465600013733,
"kl": 0.045940399169921875,
"learning_rate": 4.9722222222222224e-06,
"loss": 0.0118,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.0322754830121994,
"mask/share_reasoning": 0.8363606333732605,
"mask/share_step_conf": 0.1274576187133789,
"num_tokens": 4986733.0,
"reward": 0.9055733680725098,
"reward_std": 0.18177592754364014,
"rewards/accuracy_reward_step": 0.609375,
"rewards/asymmetric_l2_reward": 0.8619275093078613,
"rewards/final_brier_reward_step": 0.6296879053115845,
"rewards/format_reward_step": 0.98828125,
"step": 21
},
{
"adv/mean_abs_final_conf": 0.687179446220398,
"adv/mean_abs_reasoning": 0.3393262028694153,
"adv/mean_abs_step_conf": 0.7603123188018799,
"adv/ratio_final_to_reasoning": 2.0251293310373937,
"adv/ratio_step_to_reasoning": 2.2406531307412028,
"adv/std_final_conf": 0.8886227607727051,
"adv/std_reasoning": 0.6402140855789185,
"adv/std_step_conf": 0.934212863445282,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.497718760640109,
"calib/avg_num_step_conf": 6.17578125,
"calib/ece": 0.3266929133858268,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.984251968503937,
"calib/gap": -0.0016397684712292637,
"calib/mean_conf": 0.9719685039370078,
"calib/mu_c": 0.9713939393939393,
"calib/mu_w": 0.9730337078651685,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.32452755905511815,
"calib/std_conf": 0.024172587131458173,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4599161616161616,
"calib/step_q_c_n": 990.0,
"calib/step_q_gap": -0.014872332461672633,
"calib/step_q_w": 0.47478849407783424,
"calib/step_q_w_n": 591.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2402.0,
"completions/max_terminated_length": 2402.0,
"completions/mean_length": 505.25,
"completions/mean_terminated_length": 507.2314147949219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 195.0,
"epoch": 0.023466666666666667,
"grad_norm": 0.024227218702435493,
"kl": 0.056705474853515625,
"learning_rate": 4.944444444444445e-06,
"loss": 0.0027,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.031584907323122025,
"mask/share_reasoning": 0.8346095681190491,
"mask/share_step_conf": 0.1298992782831192,
"num_tokens": 5217893.0,
"reward": 0.9237887859344482,
"reward_std": 0.15195703506469727,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/asymmetric_l2_reward": 0.8583118915557861,
"rewards/final_brier_reward_step": 0.6619218587875366,
"rewards/format_reward_step": 0.9921875,
"step": 22
},
{
"adv/mean_abs_final_conf": 0.7530167102813721,
"adv/mean_abs_reasoning": 0.4509432315826416,
"adv/mean_abs_step_conf": 0.74357008934021,
"adv/ratio_final_to_reasoning": 1.6698703019414791,
"adv/ratio_step_to_reasoning": 1.6489217206577373,
"adv/std_final_conf": 0.897098183631897,
"adv/std_reasoning": 0.7014564871788025,
"adv/std_step_conf": 0.9347206950187683,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.4533584431889517,
"calib/avg_num_step_conf": 5.5859375,
"calib/ece": 0.4400395256916997,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9920948616600791,
"calib/gap": -0.0006911487758944901,
"calib/mean_conf": 0.9727667984189724,
"calib/mu_c": 0.9724444444444442,
"calib/mu_w": 0.9731355932203387,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.439604743083004,
"calib/std_conf": 0.02336031913958452,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.47111850865512644,
"calib/step_q_c_n": 751.0,
"calib/step_q_gap": 0.015625136048351806,
"calib/step_q_w": 0.45549337260677464,
"calib/step_q_w_n": 679.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2113.0,
"completions/max_terminated_length": 2113.0,
"completions/mean_length": 516.28125,
"completions/mean_terminated_length": 520.346435546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.024533333333333334,
"grad_norm": 0.030233683064579964,
"kl": 0.048114776611328125,
"learning_rate": 4.9166666666666665e-06,
"loss": -0.0494,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03315791115164757,
"mask/share_reasoning": 0.8343464136123657,
"mask/share_step_conf": 0.1246831864118576,
"num_tokens": 5453997.0,
"reward": 0.8372361660003662,
"reward_std": 0.20087505877017975,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/asymmetric_l2_reward": 0.8298656940460205,
"rewards/final_brier_reward_step": 0.5430440902709961,
"rewards/format_reward_step": 0.98046875,
"step": 23
},
{
"adv/mean_abs_final_conf": 0.7376492023468018,
"adv/mean_abs_reasoning": 0.5325069427490234,
"adv/mean_abs_step_conf": 0.7742867469787598,
"adv/ratio_final_to_reasoning": 1.3852386572440694,
"adv/ratio_step_to_reasoning": 1.4540406609190255,
"adv/std_final_conf": 0.9101276993751526,
"adv/std_reasoning": 0.7754067778587341,
"adv/std_step_conf": 0.9344092011451721,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6002886002886003,
"calib/avg_num_step_conf": 6.46484375,
"calib/ece": 0.46437246963562745,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.9919028340080972,
"calib/gap": 0.006217368490095798,
"calib/mean_conf": 0.974493927125506,
"calib/mu_c": 0.9775396825396826,
"calib/mu_w": 0.9713223140495868,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.46437246963562745,
"calib/std_conf": 0.018606487581600245,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.43772254335260113,
"calib/step_q_c_n": 865.0,
"calib/step_q_gap": -0.00044201360942414114,
"calib/step_q_w": 0.4381645569620253,
"calib/step_q_w_n": 790.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3053.0,
"completions/max_terminated_length": 3053.0,
"completions/mean_length": 605.53125,
"completions/mean_terminated_length": 605.53125,
"completions/min_length": 158.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.0256,
"grad_norm": 0.7543313503265381,
"kl": 1.2979049682617188,
"learning_rate": 4.888888888888889e-06,
"loss": 0.1109,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.0299096517264843,
"mask/share_reasoning": 0.8437927961349487,
"mask/share_step_conf": 0.12629754841327667,
"num_tokens": 5713525.0,
"reward": 0.8208262324333191,
"reward_std": 0.2088262289762497,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/asymmetric_l2_reward": 0.831911027431488,
"rewards/final_brier_reward_step": 0.5183351635932922,
"rewards/format_reward_step": 0.96484375,
"step": 24
},
{
"adv/mean_abs_final_conf": 0.7134343385696411,
"adv/mean_abs_reasoning": 0.4166892170906067,
"adv/mean_abs_step_conf": 0.7519113421440125,
"adv/ratio_final_to_reasoning": 1.7121497492806705,
"adv/ratio_step_to_reasoning": 1.8044895603346358,
"adv/std_final_conf": 0.8936386108398438,
"adv/std_reasoning": 0.7013903260231018,
"adv/std_step_conf": 0.934689462184906,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5044484580908423,
"calib/avg_num_step_conf": 6.16796875,
"calib/ece": 0.36948000000000003,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.988,
"calib/gap": 0.0007445314067830999,
"calib/mean_conf": 0.97348,
"calib/mu_c": 0.973774834437086,
"calib/mu_w": 0.9730303030303029,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.36948000000000003,
"calib/std_conf": 0.022169564722835676,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4669714285714286,
"calib/step_q_c_n": 875.0,
"calib/step_q_gap": 0.04030949675324674,
"calib/step_q_w": 0.42666193181818185,
"calib/step_q_w_n": 704.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2316.0,
"completions/max_terminated_length": 2316.0,
"completions/mean_length": 517.859375,
"completions/mean_terminated_length": 517.859375,
"completions/min_length": 189.0,
"completions/min_terminated_length": 189.0,
"epoch": 0.02666666666666667,
"grad_norm": 0.02401627041399479,
"kl": 0.04943084716796875,
"learning_rate": 4.861111111111111e-06,
"loss": 0.0918,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03197113424539566,
"mask/share_reasoning": 0.8365023136138916,
"mask/share_step_conf": 0.13152649998664856,
"num_tokens": 5949321.0,
"reward": 0.8754016160964966,
"reward_std": 0.18805429339408875,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/asymmetric_l2_reward": 0.8299168944358826,
"rewards/final_brier_reward_step": 0.6083863377571106,
"rewards/format_reward_step": 0.97265625,
"step": 25
},
{
"adv/mean_abs_final_conf": 0.7160738706588745,
"adv/mean_abs_reasoning": 0.406027227640152,
"adv/mean_abs_step_conf": 0.7333802580833435,
"adv/ratio_final_to_reasoning": 1.7636104721861319,
"adv/ratio_step_to_reasoning": 1.806234183716648,
"adv/std_final_conf": 0.8886821269989014,
"adv/std_reasoning": 0.7012869715690613,
"adv/std_step_conf": 0.9339894652366638,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.487475257661593,
"calib/avg_num_step_conf": 5.6484375,
"calib/ece": 0.33404761904761915,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0007357859531772482,
"calib/mean_conf": 0.972936507936508,
"calib/mu_c": 0.9726708074534163,
"calib/mu_w": 0.9734065934065935,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.33404761904761915,
"calib/std_conf": 0.01628454620025839,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.47772833723653396,
"calib/step_q_c_n": 854.0,
"calib/step_q_gap": 0.0363432021013988,
"calib/step_q_w": 0.44138513513513516,
"calib/step_q_w_n": 592.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2908.0,
"completions/max_terminated_length": 2908.0,
"completions/mean_length": 528.06640625,
"completions/mean_terminated_length": 528.06640625,
"completions/min_length": 202.0,
"completions/min_terminated_length": 202.0,
"epoch": 0.027733333333333332,
"grad_norm": 0.02128530666232109,
"kl": 0.05477142333984375,
"learning_rate": 4.833333333333333e-06,
"loss": 0.0559,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.029835352674126625,
"mask/share_reasoning": 0.8532722592353821,
"mask/share_step_conf": 0.11689238250255585,
"num_tokens": 6189746.0,
"reward": 0.9097167253494263,
"reward_std": 0.16467860341072083,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/asymmetric_l2_reward": 0.8499466180801392,
"rewards/final_brier_reward_step": 0.6468304395675659,
"rewards/format_reward_step": 0.984375,
"step": 26
},
{
"adv/mean_abs_final_conf": 0.7680479288101196,
"adv/mean_abs_reasoning": 0.5606542825698853,
"adv/mean_abs_step_conf": 0.7543854713439941,
"adv/ratio_final_to_reasoning": 1.3699136039585729,
"adv/ratio_step_to_reasoning": 1.3455448300262656,
"adv/std_final_conf": 0.9101540446281433,
"adv/std_reasoning": 0.7928864359855652,
"adv/std_step_conf": 0.9347414374351501,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.4851177112066201,
"calib/avg_num_step_conf": 6.8125,
"calib/ece": 0.44495999999999986,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.976,
"calib/gap": 0.003723138110205859,
"calib/mean_conf": 0.96464,
"calib/mu_c": 0.9664122137404579,
"calib/mu_w": 0.9626890756302521,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.44279999999999986,
"calib/std_conf": 0.06608532666182411,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.49636254501800725,
"calib/step_q_c_n": 833.0,
"calib/step_q_gap": 0.0435634231738799,
"calib/step_q_w": 0.45279912184412735,
"calib/step_q_w_n": 911.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3047.0,
"completions/max_terminated_length": 3047.0,
"completions/mean_length": 556.5,
"completions/mean_terminated_length": 558.682373046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 193.0,
"epoch": 0.0288,
"grad_norm": 0.023293569684028625,
"kl": 0.046802520751953125,
"learning_rate": 4.805555555555556e-06,
"loss": 0.0577,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.030663229525089264,
"mask/share_reasoning": 0.8329758644104004,
"mask/share_step_conf": 0.13245464861392975,
"num_tokens": 6437426.0,
"reward": 0.8341401219367981,
"reward_std": 0.22353267669677734,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/asymmetric_l2_reward": 0.829703688621521,
"rewards/final_brier_reward_step": 0.5409202575683594,
"rewards/format_reward_step": 0.9765625,
"step": 27
},
{
"adv/mean_abs_final_conf": 0.7091407775878906,
"adv/mean_abs_reasoning": 0.362392783164978,
"adv/mean_abs_step_conf": 0.7690234184265137,
"adv/ratio_final_to_reasoning": 1.9568291934363848,
"adv/ratio_step_to_reasoning": 2.122071559235269,
"adv/std_final_conf": 0.8629666566848755,
"adv/std_reasoning": 0.6403860449790955,
"adv/std_step_conf": 0.9344640970230103,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5548206937095825,
"calib/avg_num_step_conf": 5.734375,
"calib/ece": 0.29453815261044186,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9718875502008032,
"calib/gap": 0.010019841269841345,
"calib/mean_conf": 0.9612048192771084,
"calib/mu_c": 0.9644642857142858,
"calib/mu_w": 0.9544444444444444,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.29052208835341375,
"calib/std_conf": 0.08898893379640163,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.4770860215053763,
"calib/step_q_c_n": 930.0,
"calib/step_q_gap": 0.05139828916336886,
"calib/step_q_w": 0.42568773234200746,
"calib/step_q_w_n": 538.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2501.0,
"completions/max_terminated_length": 2501.0,
"completions/mean_length": 558.4375,
"completions/mean_terminated_length": 558.4375,
"completions/min_length": 186.0,
"completions/min_terminated_length": 186.0,
"epoch": 0.029866666666666666,
"grad_norm": 0.02454567328095436,
"kl": 0.04253387451171875,
"learning_rate": 4.777777777777778e-06,
"loss": 0.0252,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.029856320470571518,
"mask/share_reasoning": 0.8533777594566345,
"mask/share_step_conf": 0.11676593124866486,
"num_tokens": 6687330.0,
"reward": 0.9220882058143616,
"reward_std": 0.1710459291934967,
"rewards/accuracy_reward_step": 0.65625,
"rewards/asymmetric_l2_reward": 0.842483401298523,
"rewards/final_brier_reward_step": 0.6759117245674133,
"rewards/format_reward_step": 0.97265625,
"step": 28
},
{
"adv/mean_abs_final_conf": 0.751983642578125,
"adv/mean_abs_reasoning": 0.4891800880432129,
"adv/mean_abs_step_conf": 0.755209743976593,
"adv/ratio_final_to_reasoning": 1.5372327307641696,
"adv/ratio_step_to_reasoning": 1.543827646373619,
"adv/std_final_conf": 0.9133455753326416,
"adv/std_reasoning": 0.7393599152565002,
"adv/std_step_conf": 0.9341092705726624,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.4830664633371458,
"calib/avg_num_step_conf": 6.45703125,
"calib/ece": 0.48569721115537856,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9920318725099602,
"calib/gap": -0.007523827678230899,
"calib/mean_conf": 0.9637848605577689,
"calib/mu_c": 0.9599180327868853,
"calib/mu_w": 0.9674418604651162,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.48171314741035864,
"calib/std_conf": 0.06420535972185211,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.45259562841530054,
"calib/step_q_c_n": 732.0,
"calib/step_q_gap": 0.031010394973389588,
"calib/step_q_w": 0.42158523344191096,
"calib/step_q_w_n": 921.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2514.0,
"completions/max_terminated_length": 2514.0,
"completions/mean_length": 585.25,
"completions/mean_terminated_length": 589.8582763671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 198.0,
"epoch": 0.030933333333333334,
"grad_norm": 0.029458940029144287,
"kl": 0.04529571533203125,
"learning_rate": 4.75e-06,
"loss": -0.0735,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.027916226536035538,
"mask/share_reasoning": 0.8425346612930298,
"mask/share_step_conf": 0.12173663079738617,
"num_tokens": 6944282.0,
"reward": 0.8238710165023804,
"reward_std": 0.20323438942432404,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/asymmetric_l2_reward": 0.8522884845733643,
"rewards/final_brier_reward_step": 0.5040472745895386,
"rewards/format_reward_step": 0.98046875,
"step": 29
},
{
"adv/mean_abs_final_conf": 0.7376535534858704,
"adv/mean_abs_reasoning": 0.4982551336288452,
"adv/mean_abs_step_conf": 0.7502148747444153,
"adv/ratio_final_to_reasoning": 1.4804735640423028,
"adv/ratio_step_to_reasoning": 1.5056841848884133,
"adv/std_final_conf": 0.9088844656944275,
"adv/std_reasoning": 0.7576410174369812,
"adv/std_step_conf": 0.9344900250434875,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.47781114447781114,
"calib/avg_num_step_conf": 6.62890625,
"calib/ece": 0.40760162601626015,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.9715447154471545,
"calib/gap": 0.012942942942942848,
"calib/mean_conf": 0.9563821138211382,
"calib/mu_c": 0.9622222222222221,
"calib/mu_w": 0.9492792792792792,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.40760162601626015,
"calib/std_conf": 0.0807153001610265,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42158899188876015,
"calib/step_q_c_n": 863.0,
"calib/step_q_gap": 0.014454699322812903,
"calib/step_q_w": 0.40713429256594724,
"calib/step_q_w_n": 834.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2535.0,
"completions/max_terminated_length": 2535.0,
"completions/mean_length": 626.84765625,
"completions/mean_terminated_length": 631.783447265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 200.0,
"epoch": 0.032,
"grad_norm": 0.02535308338701725,
"kl": 0.0458221435546875,
"learning_rate": 4.722222222222222e-06,
"loss": 0.0568,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.026801906526088715,
"mask/share_reasoning": 0.8470184803009033,
"mask/share_step_conf": 0.11836712062358856,
"num_tokens": 7211739.0,
"reward": 0.844096839427948,
"reward_std": 0.2074047029018402,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/asymmetric_l2_reward": 0.8272979259490967,
"rewards/final_brier_reward_step": 0.5632394552230835,
"rewards/format_reward_step": 0.9609375,
"step": 30
},
{
"adv/mean_abs_final_conf": 0.7183291912078857,
"adv/mean_abs_reasoning": 0.42496633529663086,
"adv/mean_abs_step_conf": 0.7648087739944458,
"adv/ratio_final_to_reasoning": 1.690320224321968,
"adv/ratio_step_to_reasoning": 1.799692612029142,
"adv/std_final_conf": 0.9244080185890198,
"adv/std_reasoning": 0.7204815149307251,
"adv/std_step_conf": 0.9345629215240479,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.4812748493010132,
"calib/avg_num_step_conf": 6.91015625,
"calib/ece": 0.5126693227091633,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.952191235059761,
"calib/gap": -0.007899833269205736,
"calib/mean_conf": 0.9559362549800797,
"calib/mu_c": 0.9515929203539825,
"calib/mu_w": 0.9594927536231882,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.509203187250996,
"calib/std_conf": 0.06133491690864156,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4562691131498471,
"calib/step_q_c_n": 654.0,
"calib/step_q_gap": 0.06567853019020581,
"calib/step_q_w": 0.3905905829596413,
"calib/step_q_w_n": 1115.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2377.0,
"completions/max_terminated_length": 2377.0,
"completions/mean_length": 613.2578125,
"completions/mean_terminated_length": 618.0866088867188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 189.0,
"epoch": 0.03306666666666667,
"grad_norm": 0.030099138617515564,
"kl": 0.04169464111328125,
"learning_rate": 4.694444444444445e-06,
"loss": 0.0082,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.028120990842580795,
"mask/share_reasoning": 0.8433920741081238,
"mask/share_step_conf": 0.12067442387342453,
"num_tokens": 7474645.0,
"reward": 0.8082037568092346,
"reward_std": 0.17442850768566132,
"rewards/accuracy_reward_step": 0.44140625,
"rewards/asymmetric_l2_reward": 0.8525465726852417,
"rewards/final_brier_reward_step": 0.4794859290122986,
"rewards/format_reward_step": 0.98046875,
"step": 31
},
{
"adv/mean_abs_final_conf": 0.72445148229599,
"adv/mean_abs_reasoning": 0.49374479055404663,
"adv/mean_abs_step_conf": 0.7507187128067017,
"adv/ratio_final_to_reasoning": 1.4672589891694048,
"adv/ratio_step_to_reasoning": 1.5204590046697941,
"adv/std_final_conf": 0.9220708012580872,
"adv/std_reasoning": 0.7753057479858398,
"adv/std_step_conf": 0.9348052144050598,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6214615735016595,
"calib/avg_num_step_conf": 6.03125,
"calib/ece": 0.43745967741935476,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.9032258064516129,
"calib/gap": 0.02743476280340973,
"calib/mean_conf": 0.9383467741935484,
"calib/mu_c": 0.9517322834645667,
"calib/mu_w": 0.924297520661157,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4318548387096774,
"calib/std_conf": 0.11818899192469161,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4153371592539455,
"calib/step_q_c_n": 697.0,
"calib/step_q_gap": 0.04276336940742842,
"calib/step_q_w": 0.37257378984651707,
"calib/step_q_w_n": 847.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2412.0,
"completions/max_terminated_length": 2412.0,
"completions/mean_length": 592.4609375,
"completions/mean_terminated_length": 594.7843627929688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 194.0,
"epoch": 0.034133333333333335,
"grad_norm": 0.023633981123566628,
"kl": 0.0494842529296875,
"learning_rate": 4.666666666666667e-06,
"loss": 0.0108,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.029128411784768105,
"mask/share_reasoning": 0.8510886430740356,
"mask/share_step_conf": 0.1158766970038414,
"num_tokens": 7733019.0,
"reward": 0.8346266150474548,
"reward_std": 0.2019689679145813,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/asymmetric_l2_reward": 0.8270046710968018,
"rewards/final_brier_reward_step": 0.5500609874725342,
"rewards/format_reward_step": 0.96484375,
"step": 32
},
{
"adv/mean_abs_final_conf": 0.7848911285400391,
"adv/mean_abs_reasoning": 0.4544700086116791,
"adv/mean_abs_step_conf": 0.7658688426017761,
"adv/ratio_final_to_reasoning": 1.7270471398932017,
"adv/ratio_step_to_reasoning": 1.6851911635299375,
"adv/std_final_conf": 0.9196609258651733,
"adv/std_reasoning": 0.7013769149780273,
"adv/std_step_conf": 0.9336271286010742,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5360434596838186,
"calib/avg_num_step_conf": 6.3125,
"calib/ece": 0.4496875000000001,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.91015625,
"calib/gap": 0.008684612097906275,
"calib/mean_conf": 0.94578125,
"calib/mu_c": 0.9501574803149605,
"calib/mu_w": 0.9414728682170542,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4496875000000001,
"calib/std_conf": 0.061193971504041954,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3878806333739343,
"calib/step_q_c_n": 821.0,
"calib/step_q_gap": 0.009132834631795983,
"calib/step_q_w": 0.3787477987421383,
"calib/step_q_w_n": 795.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1359.0,
"completions/max_terminated_length": 1359.0,
"completions/mean_length": 535.2109375,
"completions/mean_terminated_length": 537.309814453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.0352,
"grad_norm": 0.021006744354963303,
"kl": 0.052581787109375,
"learning_rate": 4.638888888888889e-06,
"loss": 0.0037,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03007342480123043,
"mask/share_reasoning": 0.8435391187667847,
"mask/share_step_conf": 0.12248119711875916,
"num_tokens": 7976905.0,
"reward": 0.8592232465744019,
"reward_std": 0.16713036596775055,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/asymmetric_l2_reward": 0.8708338141441345,
"rewards/final_brier_reward_step": 0.548393726348877,
"rewards/format_reward_step": 1.0,
"step": 33
},
{
"adv/mean_abs_final_conf": 0.7719642519950867,
"adv/mean_abs_reasoning": 0.6268492937088013,
"adv/mean_abs_step_conf": 0.7533676624298096,
"adv/ratio_final_to_reasoning": 1.2314989579516022,
"adv/ratio_step_to_reasoning": 1.2018321947408648,
"adv/std_final_conf": 0.9304158091545105,
"adv/std_reasoning": 0.8266779780387878,
"adv/std_step_conf": 0.934239387512207,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5234611666129553,
"calib/avg_num_step_conf": 6.45703125,
"calib/ece": 0.3810714285714285,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9087301587301587,
"calib/gap": -0.0008849500483399941,
"calib/mean_conf": 0.9373412698412698,
"calib/mu_c": 0.9369655172413794,
"calib/mu_w": 0.9378504672897194,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.37150793650793645,
"calib/std_conf": 0.08487429198324628,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.34746463547334067,
"calib/step_q_c_n": 919.0,
"calib/step_q_gap": 0.025979621849362422,
"calib/step_q_w": 0.32148501362397824,
"calib/step_q_w_n": 734.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2888.0,
"completions/max_terminated_length": 2888.0,
"completions/mean_length": 521.78515625,
"completions/mean_terminated_length": 521.78515625,
"completions/min_length": 206.0,
"completions/min_terminated_length": 206.0,
"epoch": 0.03626666666666667,
"grad_norm": 0.021358368918299675,
"kl": 0.061359405517578125,
"learning_rate": 4.611111111111112e-06,
"loss": 0.0417,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03217000514268875,
"mask/share_reasoning": 0.8295138478279114,
"mask/share_step_conf": 0.13831612467765808,
"num_tokens": 8215594.0,
"reward": 0.8955637216567993,
"reward_std": 0.2316317856311798,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/asymmetric_l2_reward": 0.8735677003860474,
"rewards/final_brier_reward_step": 0.6074035167694092,
"rewards/format_reward_step": 0.984375,
"step": 34
},
{
"adv/mean_abs_final_conf": 0.7504846453666687,
"adv/mean_abs_reasoning": 0.4689059257507324,
"adv/mean_abs_step_conf": 0.7678923606872559,
"adv/ratio_final_to_reasoning": 1.6005015167277323,
"adv/ratio_step_to_reasoning": 1.6376256270548026,
"adv/std_final_conf": 0.9261394143104553,
"adv/std_reasoning": 0.7205601334571838,
"adv/std_step_conf": 0.9343000054359436,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.593344965104686,
"calib/avg_num_step_conf": 5.63671875,
"calib/ece": 0.3975590551181102,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.8188976377952756,
"calib/gap": 0.032642073778663905,
"calib/mean_conf": 0.9130708661417323,
"calib/mu_c": 0.9282352941176469,
"calib/mu_w": 0.895593220338983,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.38759842519685034,
"calib/std_conf": 0.1352309882195658,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.3523180592991914,
"calib/step_q_c_n": 742.0,
"calib/step_q_gap": 0.02979309496252952,
"calib/step_q_w": 0.3225249643366619,
"calib/step_q_w_n": 701.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2469.0,
"completions/max_terminated_length": 2469.0,
"completions/mean_length": 582.58203125,
"completions/mean_terminated_length": 582.58203125,
"completions/min_length": 193.0,
"completions/min_terminated_length": 193.0,
"epoch": 0.037333333333333336,
"grad_norm": 0.022607261314988136,
"kl": 0.05496978759765625,
"learning_rate": 4.583333333333333e-06,
"loss": -0.0247,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.02826719731092453,
"mask/share_reasoning": 0.8655970096588135,
"mask/share_step_conf": 0.106135793030262,
"num_tokens": 8473991.0,
"reward": 0.8837124705314636,
"reward_std": 0.18118566274642944,
"rewards/accuracy_reward_step": 0.53125,
"rewards/asymmetric_l2_reward": 0.8655635714530945,
"rewards/final_brier_reward_step": 0.5979551076889038,
"rewards/format_reward_step": 0.98828125,
"step": 35
},
{
"adv/mean_abs_final_conf": 0.7104417085647583,
"adv/mean_abs_reasoning": 0.39619794487953186,
"adv/mean_abs_step_conf": 0.7653812170028687,
"adv/ratio_final_to_reasoning": 1.7931483939947632,
"adv/ratio_step_to_reasoning": 1.9318152123065424,
"adv/std_final_conf": 0.9201058745384216,
"adv/std_reasoning": 0.7012619376182556,
"adv/std_step_conf": 0.9333252906799316,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5465106897942719,
"calib/avg_num_step_conf": 6.12890625,
"calib/ece": 0.21273809523809523,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.8134920634920635,
"calib/gap": 0.009519160951996675,
"calib/mean_conf": 0.9189285714285714,
"calib/mu_c": 0.9214594594594593,
"calib/mu_w": 0.9119402985074626,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19876984126984126,
"calib/std_conf": 0.1101119688844495,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.3337616179001721,
"calib/step_q_c_n": 1162.0,
"calib/step_q_gap": 0.025334099472653693,
"calib/step_q_w": 0.3084275184275184,
"calib/step_q_w_n": 407.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2003.0,
"completions/max_terminated_length": 2003.0,
"completions/mean_length": 528.51171875,
"completions/mean_terminated_length": 528.51171875,
"completions/min_length": 193.0,
"completions/min_terminated_length": 193.0,
"epoch": 0.0384,
"grad_norm": 0.046310946345329285,
"kl": 0.07181549072265625,
"learning_rate": 4.555555555555556e-06,
"loss": -0.0601,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03279054909944534,
"mask/share_reasoning": 0.8338136672973633,
"mask/share_step_conf": 0.13339582085609436,
"num_tokens": 8712002.0,
"reward": 0.9817196726799011,
"reward_std": 0.14960414171218872,
"rewards/accuracy_reward_step": 0.72265625,
"rewards/asymmetric_l2_reward": 0.8716880679130554,
"rewards/final_brier_reward_step": 0.7503449320793152,
"rewards/format_reward_step": 0.984375,
"step": 36
},
{
"adv/mean_abs_final_conf": 0.7335127592086792,
"adv/mean_abs_reasoning": 0.3861789107322693,
"adv/mean_abs_step_conf": 0.7399336099624634,
"adv/ratio_final_to_reasoning": 1.8994117462753166,
"adv/ratio_step_to_reasoning": 1.9160383682252542,
"adv/std_final_conf": 0.9107916951179504,
"adv/std_reasoning": 0.6816076636314392,
"adv/std_step_conf": 0.9339763522148132,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5764084980502892,
"calib/avg_num_step_conf": 6.046875,
"calib/ece": 0.47885714285714287,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.8857142857142857,
"calib/gap": 0.03537851284119908,
"calib/mean_conf": 0.9215510204081632,
"calib/mu_c": 0.9409009009009006,
"calib/mu_w": 0.9055223880597015,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.4736734693877551,
"calib/std_conf": 0.13022759687852142,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.3618576388888889,
"calib/step_q_c_n": 576.0,
"calib/step_q_gap": 0.07817451131687242,
"calib/step_q_w": 0.28368312757201647,
"calib/step_q_w_n": 972.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2500.0,
"completions/max_terminated_length": 2500.0,
"completions/mean_length": 562.7578125,
"completions/mean_terminated_length": 569.4308471679688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.039466666666666664,
"grad_norm": 0.04367915168404579,
"kl": 0.06029510498046875,
"learning_rate": 4.527777777777778e-06,
"loss": -0.0503,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.029208239167928696,
"mask/share_reasoning": 0.842954158782959,
"mask/share_step_conf": 0.11611886322498322,
"num_tokens": 8963164.0,
"reward": 0.8178337812423706,
"reward_std": 0.1575174331665039,
"rewards/accuracy_reward_step": 0.43359375,
"rewards/asymmetric_l2_reward": 0.8471627235412598,
"rewards/final_brier_reward_step": 0.5103796720504761,
"rewards/format_reward_step": 0.95703125,
"step": 37
},
{
"adv/mean_abs_final_conf": 0.7438644766807556,
"adv/mean_abs_reasoning": 0.3514899015426636,
"adv/mean_abs_step_conf": 0.7278769612312317,
"adv/ratio_final_to_reasoning": 2.1163182026453353,
"adv/ratio_step_to_reasoning": 2.0708332103899223,
"adv/std_final_conf": 0.9110886454582214,
"adv/std_reasoning": 0.6403597593307495,
"adv/std_step_conf": 0.9339098930358887,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6162587412587412,
"calib/avg_num_step_conf": 6.0,
"calib/ece": 0.4017670682730922,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.8192771084337349,
"calib/gap": 0.022062937062936938,
"calib/mean_conf": 0.9255421686746987,
"calib/mu_c": 0.9359090909090908,
"calib/mu_w": 0.9138461538461539,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.39859437751004,
"calib/std_conf": 0.07661387585031114,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.33544607190412784,
"calib/step_q_c_n": 751.0,
"calib/step_q_gap": 0.037586199292662825,
"calib/step_q_w": 0.297859872611465,
"calib/step_q_w_n": 785.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2778.0,
"completions/max_terminated_length": 2778.0,
"completions/mean_length": 579.6796875,
"completions/mean_terminated_length": 581.9530029296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.04053333333333333,
"grad_norm": 0.021768247708678246,
"kl": 0.060333251953125,
"learning_rate": 4.5e-06,
"loss": 0.0435,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03031359612941742,
"mask/share_reasoning": 0.8476117849349976,
"mask/share_step_conf": 0.11816837638616562,
"num_tokens": 9218450.0,
"reward": 0.8697977066040039,
"reward_std": 0.14932399988174438,
"rewards/accuracy_reward_step": 0.515625,
"rewards/asymmetric_l2_reward": 0.8586658239364624,
"rewards/final_brier_reward_step": 0.5832734107971191,
"rewards/format_reward_step": 0.97265625,
"step": 38
},
{
"adv/mean_abs_final_conf": 0.7321931719779968,
"adv/mean_abs_reasoning": 0.4579048156738281,
"adv/mean_abs_step_conf": 0.7451863884925842,
"adv/ratio_final_to_reasoning": 1.5990073633546322,
"adv/ratio_step_to_reasoning": 1.6273827288669327,
"adv/std_final_conf": 0.9224193692207336,
"adv/std_reasoning": 0.7205734252929688,
"adv/std_step_conf": 0.9335898756980896,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6688839615668883,
"calib/avg_num_step_conf": 6.234375,
"calib/ece": 0.38768627450980386,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.7725490196078432,
"calib/gap": 0.05266075388026592,
"calib/mean_conf": 0.8949019607843137,
"calib/mu_c": 0.9203030303030303,
"calib/mu_w": 0.8676422764227644,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3824705882352941,
"calib/std_conf": 0.16491676509051476,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.3508021390374331,
"calib/step_q_c_n": 748.0,
"calib/step_q_gap": 0.03972902582988591,
"calib/step_q_w": 0.3110731132075472,
"calib/step_q_w_n": 848.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2893.0,
"completions/max_terminated_length": 2893.0,
"completions/mean_length": 546.03125,
"completions/mean_terminated_length": 546.03125,
"completions/min_length": 206.0,
"completions/min_terminated_length": 206.0,
"epoch": 0.0416,
"grad_norm": 0.028995206579566002,
"kl": 0.0600128173828125,
"learning_rate": 4.472222222222223e-06,
"loss": -0.0347,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.030841421335935593,
"mask/share_reasoning": 0.8439503908157349,
"mask/share_step_conf": 0.12520815432071686,
"num_tokens": 9464322.0,
"reward": 0.8865332007408142,
"reward_std": 0.1797136813402176,
"rewards/accuracy_reward_step": 0.515625,
"rewards/asymmetric_l2_reward": 0.8740624785423279,
"rewards/final_brier_reward_step": 0.5990039110183716,
"rewards/format_reward_step": 0.984375,
"step": 39
},
{
"adv/mean_abs_final_conf": 0.7255151867866516,
"adv/mean_abs_reasoning": 0.5067353844642639,
"adv/mean_abs_step_conf": 0.7520684003829956,
"adv/ratio_final_to_reasoning": 1.4317436852247616,
"adv/ratio_step_to_reasoning": 1.4841442366968418,
"adv/std_final_conf": 0.9328517913818359,
"adv/std_reasoning": 0.7926381826400757,
"adv/std_step_conf": 0.9337299466133118,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.4924899446958271,
"calib/avg_num_step_conf": 5.52734375,
"calib/ece": 0.45719367588932813,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.7075098814229249,
"calib/gap": -0.004004524886877858,
"calib/mean_conf": 0.9013833992094862,
"calib/mu_c": 0.8992307692307693,
"calib/mu_w": 0.9032352941176471,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4480632411067194,
"calib/std_conf": 0.12519101311172237,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.35236206896551725,
"calib/step_q_c_n": 580.0,
"calib/step_q_gap": 0.020086619863720845,
"calib/step_q_w": 0.3322754491017964,
"calib/step_q_w_n": 835.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2236.0,
"completions/max_terminated_length": 2236.0,
"completions/mean_length": 570.9453125,
"completions/mean_terminated_length": 573.184326171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 197.0,
"epoch": 0.042666666666666665,
"grad_norm": 0.029202446341514587,
"kl": 0.06134796142578125,
"learning_rate": 4.444444444444444e-06,
"loss": -0.0549,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.030164752155542374,
"mask/share_reasoning": 0.8536300659179688,
"mask/share_step_conf": 0.11229896545410156,
"num_tokens": 9717244.0,
"reward": 0.8443441390991211,
"reward_std": 0.1785648614168167,
"rewards/accuracy_reward_step": 0.45703125,
"rewards/asymmetric_l2_reward": 0.8648823499679565,
"rewards/final_brier_reward_step": 0.5347433686256409,
"rewards/format_reward_step": 0.98828125,
"step": 40
},
{
"adv/mean_abs_final_conf": 0.7701693773269653,
"adv/mean_abs_reasoning": 0.47483962774276733,
"adv/mean_abs_step_conf": 0.7666299343109131,
"adv/ratio_final_to_reasoning": 1.6219568299050762,
"adv/ratio_step_to_reasoning": 1.6145028542693913,
"adv/std_final_conf": 0.9186797142028809,
"adv/std_reasoning": 0.7206448316574097,
"adv/std_step_conf": 0.9336923360824585,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6092863894139887,
"calib/avg_num_step_conf": 5.66015625,
"calib/ece": 0.19758893280632403,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6719367588932806,
"calib/gap": 0.041159420289854975,
"calib/mean_conf": 0.8674703557312253,
"calib/mu_c": 0.8786956521739129,
"calib/mu_w": 0.8375362318840579,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.168893280632411,
"calib/std_conf": 0.1793265952873243,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.3456231599607458,
"calib/step_q_c_n": 1019.0,
"calib/step_q_gap": 0.016971764611908524,
"calib/step_q_w": 0.32865139534883725,
"calib/step_q_w_n": 430.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2388.0,
"completions/max_terminated_length": 2388.0,
"completions/mean_length": 526.03515625,
"completions/mean_terminated_length": 526.03515625,
"completions/min_length": 192.0,
"completions/min_terminated_length": 192.0,
"epoch": 0.04373333333333333,
"grad_norm": 0.03552259877324104,
"kl": 0.05785369873046875,
"learning_rate": 4.416666666666667e-06,
"loss": 0.0089,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03249321132898331,
"mask/share_reasoning": 0.8456205129623413,
"mask/share_step_conf": 0.121886245906353,
"num_tokens": 9959157.0,
"reward": 0.9845508337020874,
"reward_std": 0.17637822031974792,
"rewards/accuracy_reward_step": 0.71875,
"rewards/asymmetric_l2_reward": 0.8705066442489624,
"rewards/final_brier_reward_step": 0.7571886777877808,
"rewards/format_reward_step": 0.98828125,
"step": 41
},
{
"adv/mean_abs_final_conf": 0.7168235778808594,
"adv/mean_abs_reasoning": 0.3787510395050049,
"adv/mean_abs_step_conf": 0.7346171736717224,
"adv/ratio_final_to_reasoning": 1.8925983115919267,
"adv/ratio_step_to_reasoning": 1.9395779735200305,
"adv/std_final_conf": 0.9166406393051147,
"adv/std_reasoning": 0.6814785599708557,
"adv/std_step_conf": 0.9328687787055969,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.630614352090862,
"calib/avg_num_step_conf": 6.328125,
"calib/ece": 0.3178656126482214,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6758893280632411,
"calib/gap": 0.05375709860609179,
"calib/mean_conf": 0.8887747035573121,
"calib/mu_c": 0.9108724832214763,
"calib/mu_w": 0.8571153846153845,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3088537549407115,
"calib/std_conf": 0.1539560041628695,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3585111111111111,
"calib/step_q_c_n": 900.0,
"calib/step_q_gap": 0.04060833333333336,
"calib/step_q_w": 0.31790277777777776,
"calib/step_q_w_n": 720.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2391.0,
"completions/max_terminated_length": 2391.0,
"completions/mean_length": 480.5859375,
"completions/mean_terminated_length": 482.4706115722656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 220.0,
"epoch": 0.0448,
"grad_norm": 0.02795771323144436,
"kl": 0.0621185302734375,
"learning_rate": 4.388888888888889e-06,
"loss": -0.0047,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.032921478152275085,
"mask/share_reasoning": 0.8269742727279663,
"mask/share_step_conf": 0.1361980140209198,
"num_tokens": 10186555.0,
"reward": 0.9259449243545532,
"reward_std": 0.14228272438049316,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/asymmetric_l2_reward": 0.8753531575202942,
"rewards/final_brier_reward_step": 0.6624742150306702,
"rewards/format_reward_step": 0.98828125,
"step": 42
},
{
"adv/mean_abs_final_conf": 0.7828581929206848,
"adv/mean_abs_reasoning": 0.5349443554878235,
"adv/mean_abs_step_conf": 0.7610698938369751,
"adv/ratio_final_to_reasoning": 1.4634385518598194,
"adv/ratio_step_to_reasoning": 1.4227085229134617,
"adv/std_final_conf": 0.930379331111908,
"adv/std_reasoning": 0.7753865122795105,
"adv/std_step_conf": 0.9323244690895081,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6944133383103148,
"calib/avg_num_step_conf": 5.51953125,
"calib/ece": 0.25788235294117645,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.4980392156862745,
"calib/gap": 0.1379805897723031,
"calib/mean_conf": 0.8031372549019609,
"calib/mu_c": 0.8648226950354609,
"calib/mu_w": 0.7268421052631578,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2540392156862744,
"calib/std_conf": 0.22322669560713412,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3525706940874036,
"calib/step_q_c_n": 778.0,
"calib/step_q_gap": 0.020271481488978393,
"calib/step_q_w": 0.3322992125984252,
"calib/step_q_w_n": 635.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3071.0,
"completions/max_terminated_length": 3071.0,
"completions/mean_length": 518.61328125,
"completions/mean_terminated_length": 518.61328125,
"completions/min_length": 143.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.04586666666666667,
"grad_norm": 0.030341370031237602,
"kl": 0.0573577880859375,
"learning_rate": 4.361111111111112e-06,
"loss": 0.0182,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03317509591579437,
"mask/share_reasoning": 0.8494628667831421,
"mask/share_step_conf": 0.11736202985048294,
"num_tokens": 10424544.0,
"reward": 0.9504603743553162,
"reward_std": 0.17397907376289368,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/asymmetric_l2_reward": 0.885722279548645,
"rewards/final_brier_reward_step": 0.7058234810829163,
"rewards/format_reward_step": 0.99609375,
"step": 43
},
{
"adv/mean_abs_final_conf": 0.77141273021698,
"adv/mean_abs_reasoning": 0.5115665197372437,
"adv/mean_abs_step_conf": 0.7307957410812378,
"adv/ratio_final_to_reasoning": 1.5079421745840627,
"adv/ratio_step_to_reasoning": 1.4285448966765788,
"adv/std_final_conf": 0.9244205355644226,
"adv/std_reasoning": 0.7575879096984863,
"adv/std_step_conf": 0.9330092072486877,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6712398373983739,
"calib/avg_num_step_conf": 6.0703125,
"calib/ece": 0.33601593625498005,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5179282868525896,
"calib/gap": 0.1214399136178862,
"calib/mean_conf": 0.8069322709163347,
"calib/mu_c": 0.8688617886178862,
"calib/mu_w": 0.747421875,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.32645418326693226,
"calib/std_conf": 0.2188446468993844,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.37137755102040815,
"calib/step_q_c_n": 784.0,
"calib/step_q_gap": 0.04472820037105746,
"calib/step_q_w": 0.3266493506493507,
"calib/step_q_w_n": 770.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2883.0,
"completions/max_terminated_length": 2883.0,
"completions/mean_length": 587.4921875,
"completions/mean_terminated_length": 589.796142578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 250.0,
"epoch": 0.046933333333333334,
"grad_norm": 0.03289037570357323,
"kl": 0.049488067626953125,
"learning_rate": 4.333333333333334e-06,
"loss": -0.061,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.027674272656440735,
"mask/share_reasoning": 0.8532435894012451,
"mask/share_step_conf": 0.11517593264579773,
"num_tokens": 10681262.0,
"reward": 0.9121721982955933,
"reward_std": 0.17224377393722534,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/asymmetric_l2_reward": 0.8826147317886353,
"rewards/final_brier_reward_step": 0.6495422124862671,
"rewards/format_reward_step": 0.98046875,
"step": 44
},
{
"adv/mean_abs_final_conf": 0.726306140422821,
"adv/mean_abs_reasoning": 0.43901243805885315,
"adv/mean_abs_step_conf": 0.7687171101570129,
"adv/ratio_final_to_reasoning": 1.6544090268473302,
"adv/ratio_step_to_reasoning": 1.7510144212678553,
"adv/std_final_conf": 0.9306395053863525,
"adv/std_reasoning": 0.7204497456550598,
"adv/std_step_conf": 0.9319562315940857,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6645899554990464,
"calib/avg_num_step_conf": 6.30078125,
"calib/ece": 0.2612252964426878,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5375494071146245,
"calib/gap": 0.12187412587412572,
"calib/mean_conf": 0.7992490118577076,
"calib/mu_c": 0.8522377622377623,
"calib/mu_w": 0.7303636363636365,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.24762845849802378,
"calib/std_conf": 0.23942253743503325,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.3627835051546392,
"calib/step_q_c_n": 873.0,
"calib/step_q_gap": 0.02529701866815276,
"calib/step_q_w": 0.33748648648648644,
"calib/step_q_w_n": 740.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2101.0,
"completions/max_terminated_length": 2101.0,
"completions/mean_length": 542.84765625,
"completions/mean_terminated_length": 544.9765014648438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.048,
"grad_norm": 0.031237227842211723,
"kl": 0.054935455322265625,
"learning_rate": 4.305555555555556e-06,
"loss": -0.0223,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.031349748373031616,
"mask/share_reasoning": 0.8371763229370117,
"mask/share_step_conf": 0.12756764888763428,
"num_tokens": 10925279.0,
"reward": 0.9448119401931763,
"reward_std": 0.15327224135398865,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/asymmetric_l2_reward": 0.8864164352416992,
"rewards/final_brier_reward_step": 0.6938323974609375,
"rewards/format_reward_step": 0.98828125,
"step": 45
},
{
"adv/mean_abs_final_conf": 0.7354565858840942,
"adv/mean_abs_reasoning": 0.40667724609375,
"adv/mean_abs_step_conf": 0.7561466693878174,
"adv/ratio_final_to_reasoning": 1.8084527544837161,
"adv/ratio_step_to_reasoning": 1.8593286854645055,
"adv/std_final_conf": 0.919183075428009,
"adv/std_reasoning": 0.6816875338554382,
"adv/std_step_conf": 0.9330542087554932,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6204044117647058,
"calib/avg_num_step_conf": 6.12890625,
"calib/ece": 0.283508064516129,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5362903225806451,
"calib/gap": 0.09378676470588243,
"calib/mean_conf": 0.7820564516129033,
"calib/mu_c": 0.8244117647058824,
"calib/mu_w": 0.730625,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2585887096774193,
"calib/std_conf": 0.2628921894329086,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.35542750929368033,
"calib/step_q_c_n": 807.0,
"calib/step_q_gap": 0.02335401848003199,
"calib/step_q_w": 0.33207349081364834,
"calib/step_q_w_n": 762.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2446.0,
"completions/max_terminated_length": 2446.0,
"completions/mean_length": 589.328125,
"completions/mean_terminated_length": 589.328125,
"completions/min_length": 150.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.04906666666666667,
"grad_norm": 0.0332394540309906,
"kl": 0.04784393310546875,
"learning_rate": 4.277777777777778e-06,
"loss": -0.0168,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.032339829951524734,
"mask/share_reasoning": 0.8446968197822571,
"mask/share_step_conf": 0.12296333909034729,
"num_tokens": 11180915.0,
"reward": 0.9091041088104248,
"reward_std": 0.16267403960227966,
"rewards/accuracy_reward_step": 0.53125,
"rewards/asymmetric_l2_reward": 0.8642227053642273,
"rewards/final_brier_reward_step": 0.6539855599403381,
"rewards/format_reward_step": 0.96875,
"step": 46
},
{
"adv/mean_abs_final_conf": 0.7560012340545654,
"adv/mean_abs_reasoning": 0.48656049370765686,
"adv/mean_abs_step_conf": 0.7640652656555176,
"adv/ratio_final_to_reasoning": 1.553766168506065,
"adv/ratio_step_to_reasoning": 1.5703397121974223,
"adv/std_final_conf": 0.9190072417259216,
"adv/std_reasoning": 0.7393056750297546,
"adv/std_step_conf": 0.932059109210968,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7265749601275917,
"calib/avg_num_step_conf": 6.23828125,
"calib/ece": 0.1629083665338646,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.47808764940239046,
"calib/gap": 0.20046318447634248,
"calib/mean_conf": 0.7527091633466135,
"calib/mu_c": 0.8317763157894738,
"calib/mu_w": 0.6313131313131313,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.15501992031872514,
"calib/std_conf": 0.25110399877916256,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.35576086956521746,
"calib/step_q_c_n": 920.0,
"calib/step_q_gap": 0.020487605163444944,
"calib/step_q_w": 0.3352732644017725,
"calib/step_q_w_n": 677.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2268.0,
"completions/max_terminated_length": 2268.0,
"completions/mean_length": 584.73828125,
"completions/mean_terminated_length": 589.342529296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.050133333333333335,
"grad_norm": 0.05675409361720085,
"kl": 0.051654815673828125,
"learning_rate": 4.25e-06,
"loss": -0.0528,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.028933309018611908,
"mask/share_reasoning": 0.8470029830932617,
"mask/share_step_conf": 0.11625122278928757,
"num_tokens": 11436584.0,
"reward": 0.9713031053543091,
"reward_std": 0.14311102032661438,
"rewards/accuracy_reward_step": 0.59375,
"rewards/asymmetric_l2_reward": 0.87469881772995,
"rewards/final_brier_reward_step": 0.7538449168205261,
"rewards/format_reward_step": 0.9765625,
"step": 47
},
{
"adv/mean_abs_final_conf": 0.7458685040473938,
"adv/mean_abs_reasoning": 0.45874661207199097,
"adv/mean_abs_step_conf": 0.7546839118003845,
"adv/ratio_final_to_reasoning": 1.625883405827409,
"adv/ratio_step_to_reasoning": 1.645099695432633,
"adv/std_final_conf": 0.9048066139221191,
"adv/std_reasoning": 0.7014490365982056,
"adv/std_step_conf": 0.9324750304222107,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6762935450819672,
"calib/avg_num_step_conf": 5.29296875,
"calib/ece": 0.2634400000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.52,
"calib/gap": 0.15932248975409835,
"calib/mean_conf": 0.74936,
"calib/mu_c": 0.827109375,
"calib/mu_w": 0.6677868852459017,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.25040000000000007,
"calib/std_conf": 0.2668115259879153,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.38141732283464563,
"calib/step_q_c_n": 635.0,
"calib/step_q_gap": 0.009236767279090063,
"calib/step_q_w": 0.37218055555555557,
"calib/step_q_w_n": 720.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2772.0,
"completions/max_terminated_length": 2772.0,
"completions/mean_length": 515.4296875,
"completions/mean_terminated_length": 517.4509887695312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.0512,
"grad_norm": 0.041724901646375656,
"kl": 0.057010650634765625,
"learning_rate": 4.222222222222223e-06,
"loss": -0.0685,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.033603981137275696,
"mask/share_reasoning": 0.8387103080749512,
"mask/share_step_conf": 0.12377943843603134,
"num_tokens": 11672222.0,
"reward": 0.919036865234375,
"reward_std": 0.13520202040672302,
"rewards/accuracy_reward_step": 0.5,
"rewards/asymmetric_l2_reward": 0.8569885492324829,
"rewards/final_brier_reward_step": 0.6857726573944092,
"rewards/format_reward_step": 0.9765625,
"step": 48
},
{
"adv/mean_abs_final_conf": 0.7114934325218201,
"adv/mean_abs_reasoning": 0.46754950284957886,
"adv/mean_abs_step_conf": 0.7762830257415771,
"adv/ratio_final_to_reasoning": 1.5217499498672837,
"adv/ratio_step_to_reasoning": 1.66032264179591,
"adv/std_final_conf": 0.868726372718811,
"adv/std_reasoning": 0.7015178799629211,
"adv/std_step_conf": 0.9322298765182495,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.682031040941932,
"calib/avg_num_step_conf": 5.69140625,
"calib/ece": 0.28253012048192777,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.714859437751004,
"calib/gap": 0.136437650521809,
"calib/mean_conf": 0.8418875502008032,
"calib/mu_c": 0.8972297297297297,
"calib/mu_w": 0.7607920792079207,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.2650200803212852,
"calib/std_conf": 0.23948583521095618,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4151843043995244,
"calib/step_q_c_n": 841.0,
"calib/step_q_gap": 0.04015183686705692,
"calib/step_q_w": 0.3750324675324675,
"calib/step_q_w_n": 616.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2635.0,
"completions/max_terminated_length": 2635.0,
"completions/mean_length": 529.74609375,
"completions/mean_terminated_length": 531.8235473632812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 208.0,
"epoch": 0.05226666666666667,
"grad_norm": 0.04617791995406151,
"kl": 0.058994293212890625,
"learning_rate": 4.194444444444445e-06,
"loss": -0.0631,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03090585768222809,
"mask/share_reasoning": 0.8450629711151123,
"mask/share_step_conf": 0.12012490630149841,
"num_tokens": 11912373.0,
"reward": 0.9283610582351685,
"reward_std": 0.18739992380142212,
"rewards/accuracy_reward_step": 0.578125,
"rewards/asymmetric_l2_reward": 0.8635416626930237,
"rewards/final_brier_reward_step": 0.6838054656982422,
"rewards/format_reward_step": 0.96875,
"step": 49
},
{
"adv/mean_abs_final_conf": 0.6450693607330322,
"adv/mean_abs_reasoning": 0.384204626083374,
"adv/mean_abs_step_conf": 0.7576757073402405,
"adv/ratio_final_to_reasoning": 1.6789734348306609,
"adv/ratio_step_to_reasoning": 1.972062947456082,
"adv/std_final_conf": 0.846005380153656,
"adv/std_reasoning": 0.6612586975097656,
"adv/std_step_conf": 0.9314751625061035,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7366114230927269,
"calib/avg_num_step_conf": 5.6953125,
"calib/ece": 0.24157894736842106,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.680161943319838,
"calib/gap": 0.14676003287220918,
"calib/mean_conf": 0.8274089068825912,
"calib/mu_c": 0.8856375838926173,
"calib/mu_w": 0.7388775510204081,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2328744939271255,
"calib/std_conf": 0.24239478038170492,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.43133171912832935,
"calib/step_q_c_n": 826.0,
"calib/step_q_gap": 0.046189314065038234,
"calib/step_q_w": 0.3851424050632911,
"calib/step_q_w_n": 632.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2149.0,
"completions/max_terminated_length": 2149.0,
"completions/mean_length": 552.40234375,
"completions/mean_terminated_length": 558.9525756835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 200.0,
"epoch": 0.05333333333333334,
"grad_norm": 0.07475942373275757,
"kl": 0.048542022705078125,
"learning_rate": 4.166666666666667e-06,
"loss": -0.0752,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.030771994963288307,
"mask/share_reasoning": 0.8386844396591187,
"mask/share_step_conf": 0.118824802339077,
"num_tokens": 12159148.0,
"reward": 0.9325626492500305,
"reward_std": 0.1499963104724884,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/asymmetric_l2_reward": 0.863436222076416,
"rewards/final_brier_reward_step": 0.693095326423645,
"rewards/format_reward_step": 0.9609375,
"step": 50
},
{
"adv/mean_abs_final_conf": 0.6528294086456299,
"adv/mean_abs_reasoning": 0.4469456076622009,
"adv/mean_abs_step_conf": 0.7476691007614136,
"adv/ratio_final_to_reasoning": 1.4606462116505121,
"adv/ratio_step_to_reasoning": 1.672841365803281,
"adv/std_final_conf": 0.8555540442466736,
"adv/std_reasoning": 0.7392084002494812,
"adv/std_step_conf": 0.9323121309280396,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7527228581338884,
"calib/avg_num_step_conf": 5.23828125,
"calib/ece": 0.18209677419354817,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6330645161290323,
"calib/gap": 0.23286021505376342,
"calib/mean_conf": 0.7982258064516129,
"calib/mu_c": 0.8855483870967741,
"calib/mu_w": 0.6526881720430107,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.1776612903225804,
"calib/std_conf": 0.2629716910653483,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.430625,
"calib/step_q_c_n": 816.0,
"calib/step_q_gap": 0.032586904761904734,
"calib/step_q_w": 0.39803809523809525,
"calib/step_q_w_n": 525.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2269.0,
"completions/max_terminated_length": 2269.0,
"completions/mean_length": 546.97265625,
"completions/mean_terminated_length": 546.97265625,
"completions/min_length": 152.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.0544,
"grad_norm": 0.04614703357219696,
"kl": 0.046630859375,
"learning_rate": 4.138888888888889e-06,
"loss": -0.0507,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03051183931529522,
"mask/share_reasoning": 0.8613395690917969,
"mask/share_step_conf": 0.10814858973026276,
"num_tokens": 12408469.0,
"reward": 0.9670203924179077,
"reward_std": 0.17126522958278656,
"rewards/accuracy_reward_step": 0.609375,
"rewards/asymmetric_l2_reward": 0.8670368194580078,
"rewards/final_brier_reward_step": 0.7513788938522339,
"rewards/format_reward_step": 0.96875,
"step": 51
},
{
"adv/mean_abs_final_conf": 0.5877702832221985,
"adv/mean_abs_reasoning": 0.37581461668014526,
"adv/mean_abs_step_conf": 0.7422032356262207,
"adv/ratio_final_to_reasoning": 1.563989949125497,
"adv/ratio_step_to_reasoning": 1.9749184908843174,
"adv/std_final_conf": 0.8321949243545532,
"adv/std_reasoning": 0.6612822413444519,
"adv/std_step_conf": 0.9306304454803467,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7521001344086022,
"calib/avg_num_step_conf": 5.41796875,
"calib/ece": 0.13200000000000003,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.548,
"calib/gap": 0.2936441532258066,
"calib/mean_conf": 0.7214400000000001,
"calib/mu_c": 0.7966129032258066,
"calib/mu_w": 0.50296875,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.05472000000000001,
"calib/std_conf": 0.3111467923665613,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.43921052631578944,
"calib/step_q_c_n": 988.0,
"calib/step_q_gap": 0.0395864661654135,
"calib/step_q_w": 0.39962406015037594,
"calib/step_q_w_n": 399.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1785.0,
"completions/max_terminated_length": 1785.0,
"completions/mean_length": 542.125,
"completions/mean_terminated_length": 546.3936767578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.055466666666666664,
"grad_norm": 9.542619705200195,
"kl": 10.608467102050781,
"learning_rate": 4.111111111111111e-06,
"loss": 0.0357,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.030428439378738403,
"mask/share_reasoning": 0.8515126705169678,
"mask/share_step_conf": 0.11024642735719681,
"num_tokens": 12655205.0,
"reward": 1.0111010074615479,
"reward_std": 0.15742525458335876,
"rewards/accuracy_reward_step": 0.73046875,
"rewards/asymmetric_l2_reward": 0.8825316429138184,
"rewards/final_brier_reward_step": 0.7998265624046326,
"rewards/format_reward_step": 0.96875,
"step": 52
},
{
"adv/mean_abs_final_conf": 0.6738765239715576,
"adv/mean_abs_reasoning": 0.5075792074203491,
"adv/mean_abs_step_conf": 0.7184747457504272,
"adv/ratio_final_to_reasoning": 1.3276283073067061,
"adv/ratio_step_to_reasoning": 1.4154928634722936,
"adv/std_final_conf": 0.8424535989761353,
"adv/std_reasoning": 0.7575823068618774,
"adv/std_step_conf": 0.9306389093399048,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6674550299800133,
"calib/avg_num_step_conf": 5.80859375,
"calib/ece": 0.20988142292490114,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6284584980237155,
"calib/gap": 0.16155629580279796,
"calib/mean_conf": 0.7785770750988144,
"calib/mu_c": 0.8392405063291137,
"calib/mu_w": 0.6776842105263158,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18197628458498022,
"calib/std_conf": 0.29109910124762695,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4375678496868476,
"calib/step_q_c_n": 958.0,
"calib/step_q_gap": 0.016811706019550843,
"calib/step_q_w": 0.42075614366729674,
"calib/step_q_w_n": 529.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2188.0,
"completions/max_terminated_length": 2188.0,
"completions/mean_length": 545.59375,
"completions/mean_terminated_length": 545.59375,
"completions/min_length": 154.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.05653333333333333,
"grad_norm": 0.06023424491286278,
"kl": 0.09381866455078125,
"learning_rate": 4.083333333333334e-06,
"loss": -0.0644,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03022611513733864,
"mask/share_reasoning": 0.853451132774353,
"mask/share_step_conf": 0.11632277071475983,
"num_tokens": 12900701.0,
"reward": 0.963081955909729,
"reward_std": 0.16977277398109436,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/asymmetric_l2_reward": 0.8808630704879761,
"rewards/final_brier_reward_step": 0.7242070436477661,
"rewards/format_reward_step": 0.98828125,
"step": 53
},
{
"adv/mean_abs_final_conf": 0.5761524438858032,
"adv/mean_abs_reasoning": 0.3302342891693115,
"adv/mean_abs_step_conf": 0.7532912492752075,
"adv/ratio_final_to_reasoning": 1.744677832623278,
"adv/ratio_step_to_reasoning": 2.281081262548706,
"adv/std_final_conf": 0.7929055690765381,
"adv/std_reasoning": 0.596068799495697,
"adv/std_step_conf": 0.9317674040794373,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.8392947834288617,
"calib/avg_num_step_conf": 5.21875,
"calib/ece": 0.11992187500000008,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.64453125,
"calib/gap": 0.34614089820793725,
"calib/mean_conf": 0.8008593750000002,
"calib/mu_c": 0.9049720670391062,
"calib/mu_w": 0.5588311688311689,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11078125000000008,
"calib/std_conf": 0.27467460107299574,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.46659142212189614,
"calib/step_q_c_n": 886.0,
"calib/step_q_gap": 0.04299142212189622,
"calib/step_q_w": 0.4235999999999999,
"calib/step_q_w_n": 450.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1258.0,
"completions/max_terminated_length": 1258.0,
"completions/mean_length": 469.8203125,
"completions/mean_terminated_length": 471.66278076171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.0576,
"grad_norm": 0.14687201380729675,
"kl": 0.05890655517578125,
"learning_rate": 4.055555555555556e-06,
"loss": -0.0315,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.034429773688316345,
"mask/share_reasoning": 0.8422298431396484,
"mask/share_step_conf": 0.11943414062261581,
"num_tokens": 13127207.0,
"reward": 1.0360541343688965,
"reward_std": 0.09538309276103973,
"rewards/accuracy_reward_step": 0.69921875,
"rewards/asymmetric_l2_reward": 0.8827582597732544,
"rewards/final_brier_reward_step": 0.8495062589645386,
"rewards/format_reward_step": 1.0,
"step": 54
},
{
"adv/mean_abs_final_conf": 0.61080002784729,
"adv/mean_abs_reasoning": 0.3844855725765228,
"adv/mean_abs_step_conf": 0.7339929342269897,
"adv/ratio_final_to_reasoning": 1.5886162483397857,
"adv/ratio_step_to_reasoning": 1.9090259468212054,
"adv/std_final_conf": 0.8445244431495667,
"adv/std_reasoning": 0.6815156936645508,
"adv/std_step_conf": 0.9316856861114502,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.8308004052684904,
"calib/avg_num_step_conf": 4.9296875,
"calib/ece": 0.25086956521739145,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6482213438735178,
"calib/gap": 0.2831933890577505,
"calib/mean_conf": 0.7950592885375494,
"calib/mu_c": 0.9204255319148934,
"calib/mu_w": 0.6372321428571429,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.24430830039525706,
"calib/std_conf": 0.2872626988401877,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.47453996983408747,
"calib/step_q_c_n": 663.0,
"calib/step_q_gap": 0.07060007000103241,
"calib/step_q_w": 0.40393989983305506,
"calib/step_q_w_n": 599.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2188.0,
"completions/max_terminated_length": 2188.0,
"completions/mean_length": 499.4375,
"completions/mean_terminated_length": 501.3961181640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.058666666666666666,
"grad_norm": 0.06633848696947098,
"kl": 0.06307220458984375,
"learning_rate": 4.027777777777779e-06,
"loss": -0.0112,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03279150649905205,
"mask/share_reasoning": 0.8528703451156616,
"mask/share_step_conf": 0.11043195426464081,
"num_tokens": 13362887.0,
"reward": 0.9695348739624023,
"reward_std": 0.16215607523918152,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/asymmetric_l2_reward": 0.8861154317855835,
"rewards/final_brier_reward_step": 0.745141863822937,
"rewards/format_reward_step": 0.98828125,
"step": 55
},
{
"adv/mean_abs_final_conf": 0.6004599332809448,
"adv/mean_abs_reasoning": 0.48417240381240845,
"adv/mean_abs_step_conf": 0.7438913583755493,
"adv/ratio_final_to_reasoning": 1.2401779377611777,
"adv/ratio_step_to_reasoning": 1.536418334704942,
"adv/std_final_conf": 0.8093182444572449,
"adv/std_reasoning": 0.7393047213554382,
"adv/std_step_conf": 0.9322188496589661,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6902107758344544,
"calib/avg_num_step_conf": 5.4765625,
"calib/ece": 0.40175999999999995,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.844,
"calib/gap": 0.11437247741687495,
"calib/mean_conf": 0.9097600000000001,
"calib/mu_c": 0.9651162790697675,
"calib/mu_w": 0.8507438016528925,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.39775999999999995,
"calib/std_conf": 0.21169020383569948,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.4875795297372061,
"calib/step_q_c_n": 723.0,
"calib/step_q_gap": 0.03625405109213975,
"calib/step_q_w": 0.45132547864506634,
"calib/step_q_w_n": 679.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2483.0,
"completions/max_terminated_length": 2483.0,
"completions/mean_length": 526.91796875,
"completions/mean_terminated_length": 528.984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 194.0,
"epoch": 0.05973333333333333,
"grad_norm": 0.1122933179140091,
"kl": 0.05873870849609375,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0373,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.032018400728702545,
"mask/share_reasoning": 0.8491096496582031,
"mask/share_step_conf": 0.11496569961309433,
"num_tokens": 13604618.0,
"reward": 0.8735764026641846,
"reward_std": 0.1838463842868805,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/asymmetric_l2_reward": 0.8586329221725464,
"rewards/final_brier_reward_step": 0.5932074189186096,
"rewards/format_reward_step": 0.97265625,
"step": 56
},
{
"adv/mean_abs_final_conf": 0.5015829205513,
"adv/mean_abs_reasoning": 0.3861631751060486,
"adv/mean_abs_step_conf": 0.7528830766677856,
"adv/ratio_final_to_reasoning": 1.2988885343962555,
"adv/ratio_step_to_reasoning": 1.949650109596877,
"adv/std_final_conf": 0.742138147354126,
"adv/std_reasoning": 0.6614132523536682,
"adv/std_step_conf": 0.9319602251052856,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7295296167247386,
"calib/avg_num_step_conf": 5.07421875,
"calib/ece": 0.27711999999999987,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.88,
"calib/gap": 0.10993176538908245,
"calib/mean_conf": 0.93424,
"calib/mu_c": 0.970297619047619,
"calib/mu_w": 0.8603658536585366,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.26967999999999986,
"calib/std_conf": 0.17799444485713592,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5003476245654692,
"calib/step_q_c_n": 863.0,
"calib/step_q_gap": 0.05931551447372613,
"calib/step_q_w": 0.4410321100917431,
"calib/step_q_w_n": 436.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2508.0,
"completions/max_terminated_length": 2508.0,
"completions/mean_length": 518.72265625,
"completions/mean_terminated_length": 524.87353515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 190.0,
"epoch": 0.0608,
"grad_norm": 0.05452043563127518,
"kl": 0.05272674560546875,
"learning_rate": 3.972222222222223e-06,
"loss": -0.0571,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03176348656415939,
"mask/share_reasoning": 0.8486981987953186,
"mask/share_step_conf": 0.10781954228878021,
"num_tokens": 13844203.0,
"reward": 0.9544321894645691,
"reward_std": 0.1735970377922058,
"rewards/accuracy_reward_step": 0.65625,
"rewards/asymmetric_l2_reward": 0.8717612028121948,
"rewards/final_brier_reward_step": 0.7105406522750854,
"rewards/format_reward_step": 0.9765625,
"step": 57
},
{
"adv/mean_abs_final_conf": 0.653085470199585,
"adv/mean_abs_reasoning": 0.5664516687393188,
"adv/mean_abs_step_conf": 0.7543550133705139,
"adv/ratio_final_to_reasoning": 1.1529412061810609,
"adv/ratio_step_to_reasoning": 1.331719994839786,
"adv/std_final_conf": 0.8459213376045227,
"adv/std_reasoning": 0.7929334044456482,
"adv/std_step_conf": 0.9330338835716248,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5168343526007759,
"calib/avg_num_step_conf": 6.296875,
"calib/ece": 0.37165322580645166,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.7903225806451613,
"calib/gap": 0.026798842638258824,
"calib/mean_conf": 0.8870564516129033,
"calib/mu_c": 0.899051094890511,
"calib/mu_w": 0.8722522522522522,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.35314516129032264,
"calib/std_conf": 0.22490635335836415,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.4674119076549211,
"calib/step_q_c_n": 823.0,
"calib/step_q_gap": 0.012785798656188507,
"calib/step_q_w": 0.45462610899873257,
"calib/step_q_w_n": 789.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2451.0,
"completions/max_terminated_length": 2451.0,
"completions/mean_length": 631.66796875,
"completions/mean_terminated_length": 634.1451416015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.06186666666666667,
"grad_norm": 0.0428379587829113,
"kl": 0.039905548095703125,
"learning_rate": 3.944444444444445e-06,
"loss": -0.0225,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.02726609632372856,
"mask/share_reasoning": 0.8649106621742249,
"mask/share_step_conf": 0.10391701012849808,
"num_tokens": 14112230.0,
"reward": 0.8503095507621765,
"reward_std": 0.23419946432113647,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/asymmetric_l2_reward": 0.8246171474456787,
"rewards/final_brier_reward_step": 0.5775644779205322,
"rewards/format_reward_step": 0.95703125,
"step": 58
},
{
"adv/mean_abs_final_conf": 0.6193721890449524,
"adv/mean_abs_reasoning": 0.4775833189487457,
"adv/mean_abs_step_conf": 0.7455660104751587,
"adv/ratio_final_to_reasoning": 1.2968882380739588,
"adv/ratio_step_to_reasoning": 1.561122386176082,
"adv/std_final_conf": 0.8122193217277527,
"adv/std_reasoning": 0.7393720746040344,
"adv/std_step_conf": 0.933074951171875,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6466128807900959,
"calib/avg_num_step_conf": 5.2734375,
"calib/ece": 0.3228112449799195,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.8273092369477911,
"calib/gap": 0.058149951314508286,
"calib/mean_conf": 0.9030522088353414,
"calib/mu_c": 0.9243037974683543,
"calib/mu_w": 0.866153846153846,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.2956626506024094,
"calib/std_conf": 0.2181105456543611,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5174651162790698,
"calib/step_q_c_n": 860.0,
"calib/step_q_gap": 0.03366919791172285,
"calib/step_q_w": 0.4837959183673469,
"calib/step_q_w_n": 490.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2220.0,
"completions/max_terminated_length": 2220.0,
"completions/mean_length": 548.95703125,
"completions/mean_terminated_length": 553.279541015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.06293333333333333,
"grad_norm": 0.05327894538640976,
"kl": 0.057308197021484375,
"learning_rate": 3.916666666666667e-06,
"loss": -0.0296,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03187629580497742,
"mask/share_reasoning": 0.85688316822052,
"mask/share_step_conf": 0.10342804342508316,
"num_tokens": 14359011.0,
"reward": 0.9090771079063416,
"reward_std": 0.21725571155548096,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/asymmetric_l2_reward": 0.8475908041000366,
"rewards/final_brier_reward_step": 0.6533757448196411,
"rewards/format_reward_step": 0.96875,
"step": 59
},
{
"adv/mean_abs_final_conf": 0.6204248070716858,
"adv/mean_abs_reasoning": 0.4606407880783081,
"adv/mean_abs_step_conf": 0.7363015413284302,
"adv/ratio_final_to_reasoning": 1.346873362343707,
"adv/ratio_step_to_reasoning": 1.5984288851191792,
"adv/std_final_conf": 0.8465948104858398,
"adv/std_reasoning": 0.7391853928565979,
"adv/std_step_conf": 0.9324144124984741,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6778369905956112,
"calib/avg_num_step_conf": 4.95703125,
"calib/ece": 0.3437254901960784,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.796078431372549,
"calib/gap": 0.1016175548589342,
"calib/mean_conf": 0.8845098039215686,
"calib/mu_c": 0.928344827586207,
"calib/mu_w": 0.8267272727272728,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3298039215686274,
"calib/std_conf": 0.23504954609522094,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5443636363636363,
"calib/step_q_c_n": 660.0,
"calib/step_q_gap": 0.06410091058366907,
"calib/step_q_w": 0.4802627257799672,
"calib/step_q_w_n": 609.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2740.0,
"completions/max_terminated_length": 2740.0,
"completions/mean_length": 518.02734375,
"completions/mean_terminated_length": 518.02734375,
"completions/min_length": 232.0,
"completions/min_terminated_length": 232.0,
"epoch": 0.064,
"grad_norm": 0.04044271260499954,
"kl": 0.048015594482421875,
"learning_rate": 3.88888888888889e-06,
"loss": -0.002,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03257352113723755,
"mask/share_reasoning": 0.8600834012031555,
"mask/share_step_conf": 0.10734307020902634,
"num_tokens": 14600482.0,
"reward": 0.9168381690979004,
"reward_std": 0.17467540502548218,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/asymmetric_l2_reward": 0.8741821050643921,
"rewards/final_brier_reward_step": 0.6469941139221191,
"rewards/format_reward_step": 0.99609375,
"step": 60
},
{
"adv/mean_abs_final_conf": 0.5063987374305725,
"adv/mean_abs_reasoning": 0.3942033052444458,
"adv/mean_abs_step_conf": 0.7471913695335388,
"adv/ratio_final_to_reasoning": 1.284613118899534,
"adv/ratio_step_to_reasoning": 1.8954467392661887,
"adv/std_final_conf": 0.7494310140609741,
"adv/std_reasoning": 0.6815856695175171,
"adv/std_step_conf": 0.9329997897148132,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6456996148908857,
"calib/avg_num_step_conf": 4.96875,
"calib/ece": 0.26280632411067195,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.8537549407114624,
"calib/gap": 0.10839751818570809,
"calib/mean_conf": 0.9224110671936759,
"calib/mu_c": 0.9575438596491228,
"calib/mu_w": 0.8491463414634147,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2546640316205534,
"calib/std_conf": 0.18691619784239447,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5392926829268293,
"calib/step_q_c_n": 820.0,
"calib/step_q_gap": 0.07070861213036911,
"calib/step_q_w": 0.4685840707964602,
"calib/step_q_w_n": 452.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2139.0,
"completions/max_terminated_length": 2139.0,
"completions/mean_length": 446.3984375,
"completions/mean_terminated_length": 446.3984375,
"completions/min_length": 124.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.06506666666666666,
"grad_norm": 0.039539139717817307,
"kl": 0.0517120361328125,
"learning_rate": 3.861111111111112e-06,
"loss": -0.0068,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.038807108998298645,
"mask/share_reasoning": 0.8420302867889404,
"mask/share_step_conf": 0.11916261911392212,
"num_tokens": 14818824.0,
"reward": 0.9512588381767273,
"reward_std": 0.16452768445014954,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/asymmetric_l2_reward": 0.8505984544754028,
"rewards/final_brier_reward_step": 0.7214503288269043,
"rewards/format_reward_step": 0.984375,
"step": 61
},
{
"adv/mean_abs_final_conf": 0.7046828269958496,
"adv/mean_abs_reasoning": 0.6048574447631836,
"adv/mean_abs_step_conf": 0.782432496547699,
"adv/ratio_final_to_reasoning": 1.165039519802472,
"adv/ratio_step_to_reasoning": 1.293581658491515,
"adv/std_final_conf": 0.8792684078216553,
"adv/std_reasoning": 0.8099877238273621,
"adv/std_step_conf": 0.9344117045402527,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.590156823490157,
"calib/avg_num_step_conf": 5.34375,
"calib/ece": 0.3766260162601627,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.7479674796747967,
"calib/gap": 0.04934334334334367,
"calib/mean_conf": 0.8574390243902439,
"calib/mu_c": 0.8797037037037038,
"calib/mu_w": 0.8303603603603601,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.3426422764227643,
"calib/std_conf": 0.25203470035773656,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5694727592267135,
"calib/step_q_c_n": 569.0,
"calib/step_q_gap": 0.0766567392016822,
"calib/step_q_w": 0.4928160200250313,
"calib/step_q_w_n": 799.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2655.0,
"completions/max_terminated_length": 2655.0,
"completions/mean_length": 535.71875,
"completions/mean_terminated_length": 539.93701171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.06613333333333334,
"grad_norm": 0.05797210708260536,
"kl": 0.049541473388671875,
"learning_rate": 3.833333333333334e-06,
"loss": -0.0318,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03127940744161606,
"mask/share_reasoning": 0.858956515789032,
"mask/share_step_conf": 0.10195156186819077,
"num_tokens": 15063048.0,
"reward": 0.8401246070861816,
"reward_std": 0.255887508392334,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/asymmetric_l2_reward": 0.7979668378829956,
"rewards/final_brier_reward_step": 0.5869699120521545,
"rewards/format_reward_step": 0.94921875,
"step": 62
},
{
"adv/mean_abs_final_conf": 0.7219770550727844,
"adv/mean_abs_reasoning": 0.5437690019607544,
"adv/mean_abs_step_conf": 0.7378803491592407,
"adv/ratio_final_to_reasoning": 1.3277274954428018,
"adv/ratio_step_to_reasoning": 1.3569739107940102,
"adv/std_final_conf": 0.8784008622169495,
"adv/std_reasoning": 0.7928453683853149,
"adv/std_step_conf": 0.9337167739868164,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7164169119614665,
"calib/avg_num_step_conf": 5.10546875,
"calib/ece": 0.2565194109772422,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6706827309236948,
"calib/gap": 0.1969813129961645,
"calib/mean_conf": 0.821338688085676,
"calib/mu_c": 0.9012387387387387,
"calib/mu_w": 0.7042574257425742,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.2417402945113787,
"calib/std_conf": 0.25969706026981004,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5517697841726619,
"calib/step_q_c_n": 695.0,
"calib/step_q_gap": 0.06320769266939391,
"calib/step_q_w": 0.48856209150326796,
"calib/step_q_w_n": 612.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2813.0,
"completions/max_terminated_length": 2813.0,
"completions/mean_length": 574.24609375,
"completions/mean_terminated_length": 576.498046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.0672,
"grad_norm": 0.043521128594875336,
"kl": 0.037807464599609375,
"learning_rate": 3.8055555555555556e-06,
"loss": -0.0087,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.031664442270994186,
"mask/share_reasoning": 0.8629859089851379,
"mask/share_step_conf": 0.10144336521625519,
"num_tokens": 15318695.0,
"reward": 0.932765007019043,
"reward_std": 0.20682235062122345,
"rewards/accuracy_reward_step": 0.578125,
"rewards/asymmetric_l2_reward": 0.8452221155166626,
"rewards/final_brier_reward_step": 0.7109330296516418,
"rewards/format_reward_step": 0.96875,
"step": 63
},
{
"adv/mean_abs_final_conf": 0.6922527551651001,
"adv/mean_abs_reasoning": 0.5614722967147827,
"adv/mean_abs_step_conf": 0.7511861324310303,
"adv/ratio_final_to_reasoning": 1.2329241517622933,
"adv/ratio_step_to_reasoning": 1.3378863691517422,
"adv/std_final_conf": 0.8831756711006165,
"adv/std_reasoning": 0.7928008437156677,
"adv/std_step_conf": 0.9331434369087219,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6446036498431708,
"calib/avg_num_step_conf": 5.12109375,
"calib/ece": 0.1753386454183265,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6254980079681275,
"calib/gap": 0.18141930424864539,
"calib/mean_conf": 0.7977290836653387,
"calib/mu_c": 0.858443113772455,
"calib/mu_w": 0.6770238095238096,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.15386454183266915,
"calib/std_conf": 0.2702736224399446,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5586697782963826,
"calib/step_q_c_n": 857.0,
"calib/step_q_gap": 0.035542024992417875,
"calib/step_q_w": 0.5231277533039648,
"calib/step_q_w_n": 454.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2473.0,
"completions/max_terminated_length": 2473.0,
"completions/mean_length": 513.83984375,
"completions/mean_terminated_length": 515.8549194335938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 207.0,
"epoch": 0.06826666666666667,
"grad_norm": 0.15224145352840424,
"kl": 0.09376144409179688,
"learning_rate": 3.777777777777778e-06,
"loss": 0.0487,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.032815635204315186,
"mask/share_reasoning": 0.8548951148986816,
"mask/share_step_conf": 0.10838305950164795,
"num_tokens": 15554014.0,
"reward": 0.9618469476699829,
"reward_std": 0.19402143359184265,
"rewards/accuracy_reward_step": 0.65625,
"rewards/asymmetric_l2_reward": 0.8472946882247925,
"rewards/final_brier_reward_step": 0.7498366832733154,
"rewards/format_reward_step": 0.9765625,
"step": 64
},
{
"adv/mean_abs_final_conf": 0.6335813403129578,
"adv/mean_abs_reasoning": 0.36810600757598877,
"adv/mean_abs_step_conf": 0.7440919876098633,
"adv/ratio_final_to_reasoning": 1.7211926110229714,
"adv/ratio_step_to_reasoning": 2.021406802105122,
"adv/std_final_conf": 0.8394150733947754,
"adv/std_reasoning": 0.6611788272857666,
"adv/std_step_conf": 0.933049201965332,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7085637993515164,
"calib/avg_num_step_conf": 4.49609375,
"calib/ece": 0.28799212598425206,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.7086614173228346,
"calib/gap": 0.13941445737173375,
"calib/mean_conf": 0.8620866141732284,
"calib/mu_c": 0.9208163265306123,
"calib/mu_w": 0.7814018691588785,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.28566929133858276,
"calib/std_conf": 0.2172899845365054,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6184561891515994,
"calib/step_q_c_n": 719.0,
"calib/step_q_gap": 0.007021003966414252,
"calib/step_q_w": 0.6114351851851851,
"calib/step_q_w_n": 432.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1244.0,
"completions/max_terminated_length": 1244.0,
"completions/mean_length": 425.75,
"completions/mean_terminated_length": 427.4196472167969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.06933333333333333,
"grad_norm": 0.05041688680648804,
"kl": 0.04650115966796875,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.044,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.038847897201776505,
"mask/share_reasoning": 0.8437234163284302,
"mask/share_step_conf": 0.1135224848985672,
"num_tokens": 15768030.0,
"reward": 0.9121578335762024,
"reward_std": 0.1568649709224701,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/asymmetric_l2_reward": 0.8197988271713257,
"rewards/final_brier_reward_step": 0.6912355422973633,
"rewards/format_reward_step": 0.9921875,
"step": 65
},
{
"adv/mean_abs_final_conf": 0.699600338935852,
"adv/mean_abs_reasoning": 0.516379177570343,
"adv/mean_abs_step_conf": 0.7368757724761963,
"adv/ratio_final_to_reasoning": 1.3548190347790505,
"adv/ratio_step_to_reasoning": 1.4270052017653567,
"adv/std_final_conf": 0.8724037408828735,
"adv/std_reasoning": 0.7927989959716797,
"adv/std_step_conf": 0.9339161515235901,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6934820904286553,
"calib/avg_num_step_conf": 5.796875,
"calib/ece": 0.20552419354838708,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.4475806451612903,
"calib/gap": 0.17645005545768921,
"calib/mean_conf": 0.7271370967741936,
"calib/mu_c": 0.810381679389313,
"calib/mu_w": 0.6339316239316238,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20221774193548386,
"calib/std_conf": 0.26080417924978105,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6141432791728212,
"calib/step_q_c_n": 677.0,
"calib/step_q_gap": 0.06765505116786441,
"calib/step_q_w": 0.5464882280049568,
"calib/step_q_w_n": 807.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2696.0,
"completions/max_terminated_length": 2696.0,
"completions/mean_length": 548.875,
"completions/mean_terminated_length": 548.875,
"completions/min_length": 147.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.0704,
"grad_norm": 0.03213903680443764,
"kl": 0.04193115234375,
"learning_rate": 3.7222222222222225e-06,
"loss": 0.1051,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03228248655796051,
"mask/share_reasoning": 0.8526995182037354,
"mask/share_step_conf": 0.11501805484294891,
"num_tokens": 16014894.0,
"reward": 0.9093428254127502,
"reward_std": 0.187605082988739,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/asymmetric_l2_reward": 0.8142845034599304,
"rewards/final_brier_reward_step": 0.7083073854446411,
"rewards/format_reward_step": 0.96875,
"step": 66
},
{
"adv/mean_abs_final_conf": 0.6824854612350464,
"adv/mean_abs_reasoning": 0.29309481382369995,
"adv/mean_abs_step_conf": 0.7393745183944702,
"adv/ratio_final_to_reasoning": 2.328548404973039,
"adv/ratio_step_to_reasoning": 2.5226462002129213,
"adv/std_final_conf": 0.870021402835846,
"adv/std_reasoning": 0.6184077858924866,
"adv/std_step_conf": 0.9328907132148743,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.8083670715249662,
"calib/avg_num_step_conf": 4.8515625,
"calib/ece": 0.13792828685258968,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.47410358565737054,
"calib/gap": 0.27966531713900145,
"calib/mean_conf": 0.7428685258964144,
"calib/mu_c": 0.8487179487179489,
"calib/mu_w": 0.5690526315789475,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.12964143426294825,
"calib/std_conf": 0.2607859302946761,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6446345177664974,
"calib/step_q_c_n": 788.0,
"calib/step_q_gap": 0.061043768867818926,
"calib/step_q_w": 0.5835907488986785,
"calib/step_q_w_n": 454.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2463.0,
"completions/max_terminated_length": 2463.0,
"completions/mean_length": 531.15625,
"completions/mean_terminated_length": 531.15625,
"completions/min_length": 162.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.07146666666666666,
"grad_norm": 0.045603763312101364,
"kl": 0.057559967041015625,
"learning_rate": 3.694444444444445e-06,
"loss": 0.0319,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03274885565042496,
"mask/share_reasoning": 0.8626049160957336,
"mask/share_step_conf": 0.10464620590209961,
"num_tokens": 16255878.0,
"reward": 0.9663029909133911,
"reward_std": 0.13919922709465027,
"rewards/accuracy_reward_step": 0.609375,
"rewards/asymmetric_l2_reward": 0.8169246912002563,
"rewards/final_brier_reward_step": 0.7977124452590942,
"rewards/format_reward_step": 0.98046875,
"step": 67
},
{
"adv/mean_abs_final_conf": 0.729244589805603,
"adv/mean_abs_reasoning": 0.5466316938400269,
"adv/mean_abs_step_conf": 0.739362895488739,
"adv/ratio_final_to_reasoning": 1.3340693524057137,
"adv/ratio_step_to_reasoning": 1.3525796323568378,
"adv/std_final_conf": 0.910574197769165,
"adv/std_reasoning": 0.7928569316864014,
"adv/std_step_conf": 0.9348320960998535,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.6934589041095891,
"calib/avg_num_step_conf": 5.1171875,
"calib/ece": 0.14150406504065038,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.4024390243902439,
"calib/gap": 0.18019315068493158,
"calib/mean_conf": 0.7252439024390244,
"calib/mu_c": 0.7984931506849315,
"calib/mu_w": 0.6183,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.13662601626016257,
"calib/std_conf": 0.2594485659967677,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.6530057803468208,
"calib/step_q_c_n": 692.0,
"calib/step_q_gap": 0.07198021400377874,
"calib/step_q_w": 0.581025566343042,
"calib/step_q_w_n": 618.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2385.0,
"completions/max_terminated_length": 2385.0,
"completions/mean_length": 501.19140625,
"completions/mean_terminated_length": 503.1568908691406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.07253333333333334,
"grad_norm": 0.07743779569864273,
"kl": 0.046253204345703125,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.0019,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03694310039281845,
"mask/share_reasoning": 0.8424147963523865,
"mask/share_step_conf": 0.11673584580421448,
"num_tokens": 16488271.0,
"reward": 0.9158475399017334,
"reward_std": 0.21296223998069763,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/asymmetric_l2_reward": 0.8014019727706909,
"rewards/final_brier_reward_step": 0.7256054878234863,
"rewards/format_reward_step": 0.953125,
"step": 68
},
{
"adv/mean_abs_final_conf": 0.7655402421951294,
"adv/mean_abs_reasoning": 0.5517368316650391,
"adv/mean_abs_step_conf": 0.7545615434646606,
"adv/ratio_final_to_reasoning": 1.3875097659963562,
"adv/ratio_step_to_reasoning": 1.3676113323584622,
"adv/std_final_conf": 0.9263461232185364,
"adv/std_reasoning": 0.792895495891571,
"adv/std_step_conf": 0.9345738887786865,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6088992974238876,
"calib/avg_num_step_conf": 4.86328125,
"calib/ece": 0.19758064516129042,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.2862903225806452,
"calib/gap": 0.10129065833983864,
"calib/mean_conf": 0.6591935483870969,
"calib/mu_c": 0.7106557377049181,
"calib/mu_w": 0.6093650793650794,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.18241935483870975,
"calib/std_conf": 0.24994224098648157,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.6377577933450087,
"calib/step_q_c_n": 571.0,
"calib/step_q_gap": 0.03260541945776829,
"calib/step_q_w": 0.6051523738872404,
"calib/step_q_w_n": 674.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2589.0,
"completions/max_terminated_length": 2589.0,
"completions/mean_length": 565.609375,
"completions/mean_terminated_length": 567.8275146484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.0736,
"grad_norm": 0.04150233417749405,
"kl": 0.0474853515625,
"learning_rate": 3.638888888888889e-06,
"loss": -0.0577,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03185002878308296,
"mask/share_reasoning": 0.8687409162521362,
"mask/share_step_conf": 0.09550271928310394,
"num_tokens": 16737563.0,
"reward": 0.8744354844093323,
"reward_std": 0.19976764917373657,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/asymmetric_l2_reward": 0.7783041596412659,
"rewards/final_brier_reward_step": 0.6830667853355408,
"rewards/format_reward_step": 0.9609375,
"step": 69
},
{
"adv/mean_abs_final_conf": 0.7534902691841125,
"adv/mean_abs_reasoning": 0.4766578674316406,
"adv/mean_abs_step_conf": 0.7700801491737366,
"adv/ratio_final_to_reasoning": 1.5807779975270702,
"adv/ratio_step_to_reasoning": 1.6155825840516034,
"adv/std_final_conf": 0.9181990027427673,
"adv/std_reasoning": 0.720661461353302,
"adv/std_step_conf": 0.9350027441978455,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.8443896507464703,
"calib/avg_num_step_conf": 5.5859375,
"calib/ece": 0.20327868852459027,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.3524590163934426,
"calib/gap": 0.3460663379044788,
"calib/mean_conf": 0.6605737704918033,
"calib/mu_c": 0.8463716814159292,
"calib/mu_w": 0.5003053435114504,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.2003688524590165,
"calib/std_conf": 0.28365674008783714,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.6533443708609271,
"calib/step_q_c_n": 604.0,
"calib/step_q_gap": 0.07408964931129025,
"calib/step_q_w": 0.5792547215496369,
"calib/step_q_w_n": 826.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2927.0,
"completions/max_terminated_length": 2927.0,
"completions/mean_length": 599.86328125,
"completions/mean_terminated_length": 599.86328125,
"completions/min_length": 106.0,
"completions/min_terminated_length": 106.0,
"epoch": 0.07466666666666667,
"grad_norm": 0.04824332147836685,
"kl": 0.04335784912109375,
"learning_rate": 3.6111111111111115e-06,
"loss": 0.0352,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.033216990530490875,
"mask/share_reasoning": 0.8483107686042786,
"mask/share_step_conf": 0.11847224086523056,
"num_tokens": 16998120.0,
"reward": 0.8946191072463989,
"reward_std": 0.18673905730247498,
"rewards/accuracy_reward_step": 0.44140625,
"rewards/asymmetric_l2_reward": 0.7578139305114746,
"rewards/final_brier_reward_step": 0.7556430101394653,
"rewards/format_reward_step": 0.9375,
"step": 70
},
{
"adv/mean_abs_final_conf": 0.7033551931381226,
"adv/mean_abs_reasoning": 0.5371626615524292,
"adv/mean_abs_step_conf": 0.746070146560669,
"adv/ratio_final_to_reasoning": 1.3093895824877104,
"adv/ratio_step_to_reasoning": 1.3889091702771854,
"adv/std_final_conf": 0.8913128972053528,
"adv/std_reasoning": 0.7754039168357849,
"adv/std_step_conf": 0.9348368048667908,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6245294234198534,
"calib/avg_num_step_conf": 5.86328125,
"calib/ece": 0.22508000000000003,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.472,
"calib/gap": 0.11424278449243785,
"calib/mean_conf": 0.723,
"calib/mu_c": 0.7700680272108843,
"calib/mu_w": 0.6558252427184464,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.18004000000000003,
"calib/std_conf": 0.28955034104625055,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.6275355670103093,
"calib/step_q_c_n": 776.0,
"calib/step_q_gap": 0.03587267045858522,
"calib/step_q_w": 0.5916628965517241,
"calib/step_q_w_n": 725.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2462.0,
"completions/max_terminated_length": 2462.0,
"completions/mean_length": 533.4921875,
"completions/mean_terminated_length": 535.5843505859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.07573333333333333,
"grad_norm": 0.036347754299640656,
"kl": 0.049762725830078125,
"learning_rate": 3.5833333333333335e-06,
"loss": -0.0425,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.0346219427883625,
"mask/share_reasoning": 0.8352914452552795,
"mask/share_step_conf": 0.12618035078048706,
"num_tokens": 17239102.0,
"reward": 0.8964895009994507,
"reward_std": 0.19243742525577545,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/asymmetric_l2_reward": 0.7922806143760681,
"rewards/final_brier_reward_step": 0.691323459148407,
"rewards/format_reward_step": 0.97265625,
"step": 71
},
{
"adv/mean_abs_final_conf": 0.6750290393829346,
"adv/mean_abs_reasoning": 0.45832559466362,
"adv/mean_abs_step_conf": 0.7397451400756836,
"adv/ratio_final_to_reasoning": 1.4728154989431919,
"adv/ratio_step_to_reasoning": 1.614016648183496,
"adv/std_final_conf": 0.8813406229019165,
"adv/std_reasoning": 0.7574042081832886,
"adv/std_step_conf": 0.935146689414978,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7351403061224491,
"calib/avg_num_step_conf": 4.8671875,
"calib/ece": 0.21115079365079364,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.47619047619047616,
"calib/gap": 0.22401785714285705,
"calib/mean_conf": 0.7532936507936507,
"calib/mu_c": 0.8528571428571429,
"calib/mu_w": 0.6288392857142858,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.20444444444444443,
"calib/std_conf": 0.2702509582515445,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.647175965665236,
"calib/step_q_c_n": 699.0,
"calib/step_q_gap": 0.05842441173470592,
"calib/step_q_w": 0.5887515539305301,
"calib/step_q_w_n": 547.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2390.0,
"completions/max_terminated_length": 2390.0,
"completions/mean_length": 483.91015625,
"completions/mean_terminated_length": 483.91015625,
"completions/min_length": 148.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.0768,
"grad_norm": 0.31355804204940796,
"kl": 0.11542510986328125,
"learning_rate": 3.555555555555556e-06,
"loss": 0.0661,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.034122809767723083,
"mask/share_reasoning": 0.8555728197097778,
"mask/share_step_conf": 0.11030436307191849,
"num_tokens": 17467391.0,
"reward": 0.9290406107902527,
"reward_std": 0.1833093762397766,
"rewards/accuracy_reward_step": 0.546875,
"rewards/asymmetric_l2_reward": 0.8128567934036255,
"rewards/final_brier_reward_step": 0.7397554516792297,
"rewards/format_reward_step": 0.98046875,
"step": 72
},
{
"adv/mean_abs_final_conf": 0.6979169249534607,
"adv/mean_abs_reasoning": 0.611485481262207,
"adv/mean_abs_step_conf": 0.7284319400787354,
"adv/ratio_final_to_reasoning": 1.1413466817116327,
"adv/ratio_step_to_reasoning": 1.191249771908127,
"adv/std_final_conf": 0.8787330389022827,
"adv/std_reasoning": 0.8267027735710144,
"adv/std_step_conf": 0.9347501993179321,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.765242718446602,
"calib/avg_num_step_conf": 5.03125,
"calib/ece": 0.2145454545454545,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5889328063241107,
"calib/gap": 0.232482200647249,
"calib/mean_conf": 0.7950197628458499,
"calib/mu_c": 0.8896666666666666,
"calib/mu_w": 0.6571844660194176,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.20833992094861656,
"calib/std_conf": 0.2677967852430365,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.6364171390013496,
"calib/step_q_c_n": 741.0,
"calib/step_q_gap": 0.059897394942848625,
"calib/step_q_w": 0.5765197440585009,
"calib/step_q_w_n": 547.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2415.0,
"completions/max_terminated_length": 2415.0,
"completions/mean_length": 468.234375,
"completions/mean_terminated_length": 468.234375,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.07786666666666667,
"grad_norm": 0.027293583378195763,
"kl": 0.050006866455078125,
"learning_rate": 3.5277777777777784e-06,
"loss": -0.0106,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.0346967875957489,
"mask/share_reasoning": 0.8508319854736328,
"mask/share_step_conf": 0.11447125673294067,
"num_tokens": 17694291.0,
"reward": 0.9352380037307739,
"reward_std": 0.20372015237808228,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/asymmetric_l2_reward": 0.8135044574737549,
"rewards/final_brier_reward_step": 0.7436902523040771,
"rewards/format_reward_step": 0.98046875,
"step": 73
},
{
"adv/mean_abs_final_conf": 0.6834238767623901,
"adv/mean_abs_reasoning": 0.4758460521697998,
"adv/mean_abs_step_conf": 0.7555922269821167,
"adv/ratio_final_to_reasoning": 1.4362289518764753,
"adv/ratio_step_to_reasoning": 1.587892183904245,
"adv/std_final_conf": 0.883694052696228,
"adv/std_reasoning": 0.7206491827964783,
"adv/std_step_conf": 0.92009037733078,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.741551724137931,
"calib/avg_num_step_conf": 4.6953125,
"calib/ece": 0.22875518672199174,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.4730290456431535,
"calib/gap": 0.2606606896551724,
"calib/mean_conf": 0.7315767634854772,
"calib/mu_c": 0.85704,
"calib/mu_w": 0.5963793103448276,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 0.94921875,
"calib/nonempty_step_conf_rate": 0.9453125,
"calib/pce": 0.22082987551867225,
"calib/std_conf": 0.3010369411849477,
"calib/step_conf_rate": 0.9453125,
"calib/step_q_c": 0.6306426644182125,
"calib/step_q_c_n": 593.0,
"calib/step_q_gap": 0.06642263157748995,
"calib/step_q_w": 0.5642200328407225,
"calib/step_q_w_n": 609.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2564.0,
"completions/max_terminated_length": 2564.0,
"completions/mean_length": 521.0859375,
"completions/mean_terminated_length": 523.1294555664062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.07893333333333333,
"grad_norm": 0.0466296449303627,
"kl": 0.0598602294921875,
"learning_rate": 3.5e-06,
"loss": 0.0043,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.0345626175403595,
"mask/share_reasoning": 0.8509548306465149,
"mask/share_step_conf": 0.1105763390660286,
"num_tokens": 17931617.0,
"reward": 0.8789986968040466,
"reward_std": 0.19282305240631104,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/asymmetric_l2_reward": 0.7778134346008301,
"rewards/final_brier_reward_step": 0.6950277090072632,
"rewards/format_reward_step": 0.93359375,
"step": 74
},
{
"adv/mean_abs_final_conf": 0.628893256187439,
"adv/mean_abs_reasoning": 0.4681423604488373,
"adv/mean_abs_step_conf": 0.7405921816825867,
"adv/ratio_final_to_reasoning": 1.3433803674260107,
"adv/ratio_step_to_reasoning": 1.581980705554043,
"adv/std_final_conf": 0.8261920809745789,
"adv/std_reasoning": 0.7575705051422119,
"adv/std_step_conf": 0.9339452981948853,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.8038342655284075,
"calib/avg_num_step_conf": 4.921875,
"calib/ece": 0.18517928286852595,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.7330677290836654,
"calib/gap": 0.26919782160730044,
"calib/mean_conf": 0.8500398406374503,
"calib/mu_c": 0.934767441860465,
"calib/mu_w": 0.6655696202531646,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.17498007968127494,
"calib/std_conf": 0.2661478156350438,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.6109479191438763,
"calib/step_q_c_n": 841.0,
"calib/step_q_gap": 0.051716654227408454,
"calib/step_q_w": 0.5592312649164678,
"calib/step_q_w_n": 419.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2218.0,
"completions/max_terminated_length": 2218.0,
"completions/mean_length": 469.1328125,
"completions/mean_terminated_length": 469.1328125,
"completions/min_length": 165.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.08,
"grad_norm": 0.02828267775475979,
"kl": 0.0692291259765625,
"learning_rate": 3.4722222222222224e-06,
"loss": -0.0302,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.035453006625175476,
"mask/share_reasoning": 0.8479112386703491,
"mask/share_step_conf": 0.1166357472538948,
"num_tokens": 18156467.0,
"reward": 0.9559888243675232,
"reward_std": 0.22168438136577606,
"rewards/accuracy_reward_step": 0.671875,
"rewards/asymmetric_l2_reward": 0.8216685652732849,
"rewards/final_brier_reward_step": 0.7645277380943298,
"rewards/format_reward_step": 0.95703125,
"step": 75
},
{
"adv/mean_abs_final_conf": 0.622812032699585,
"adv/mean_abs_reasoning": 0.5359321236610413,
"adv/mean_abs_step_conf": 0.7628259062767029,
"adv/ratio_final_to_reasoning": 1.1621099113168523,
"adv/ratio_step_to_reasoning": 1.4233629084700363,
"adv/std_final_conf": 0.8540507555007935,
"adv/std_reasoning": 0.7928803563117981,
"adv/std_step_conf": 0.9348656535148621,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7480906148867315,
"calib/avg_num_step_conf": 4.6171875,
"calib/ece": 0.25411067193675896,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.6956521739130435,
"calib/gap": 0.22156634304207123,
"calib/mean_conf": 0.8291304347826086,
"calib/mu_c": 0.9193333333333332,
"calib/mu_w": 0.697766990291262,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.24517786561264823,
"calib/std_conf": 0.2779797863926332,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.595253709198813,
"calib/step_q_c_n": 674.0,
"calib/step_q_gap": 0.01054898478936428,
"calib/step_q_w": 0.5847047244094488,
"calib/step_q_w_n": 508.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1479.0,
"completions/max_terminated_length": 1479.0,
"completions/mean_length": 486.13671875,
"completions/mean_terminated_length": 488.04315185546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 190.0,
"epoch": 0.08106666666666666,
"grad_norm": 0.043174050748348236,
"kl": 0.0677642822265625,
"learning_rate": 3.444444444444445e-06,
"loss": -0.0746,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03488195687532425,
"mask/share_reasoning": 0.8566423058509827,
"mask/share_step_conf": 0.10456950962543488,
"num_tokens": 18383974.0,
"reward": 0.9037140607833862,
"reward_std": 0.22321432828903198,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/asymmetric_l2_reward": 0.7896616458892822,
"rewards/final_brier_reward_step": 0.7076101303100586,
"rewards/format_reward_step": 0.96484375,
"step": 76
},
{
"adv/mean_abs_final_conf": 0.6962723731994629,
"adv/mean_abs_reasoning": 0.5586796998977661,
"adv/mean_abs_step_conf": 0.7595263123512268,
"adv/ratio_final_to_reasoning": 1.2462818558234263,
"adv/ratio_step_to_reasoning": 1.359502255926274,
"adv/std_final_conf": 0.863667905330658,
"adv/std_reasoning": 0.7929328680038452,
"adv/std_step_conf": 0.9350054860115051,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6630559540889527,
"calib/avg_num_step_conf": 5.11328125,
"calib/ece": 0.22903614457831326,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.6305220883534136,
"calib/gap": 0.147654949784792,
"calib/mean_conf": 0.8014859437751004,
"calib/mu_c": 0.851890243902439,
"calib/mu_w": 0.704235294117647,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.1859437751004016,
"calib/std_conf": 0.29090324783614796,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.5586034255599472,
"calib/step_q_c_n": 759.0,
"calib/step_q_gap": 0.03651251646903819,
"calib/step_q_w": 0.522090909090909,
"calib/step_q_w_n": 550.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3045.0,
"completions/max_terminated_length": 3045.0,
"completions/mean_length": 512.046875,
"completions/mean_terminated_length": 512.046875,
"completions/min_length": 107.0,
"completions/min_terminated_length": 107.0,
"epoch": 0.08213333333333334,
"grad_norm": 0.025604577735066414,
"kl": 0.06995391845703125,
"learning_rate": 3.416666666666667e-06,
"loss": -0.0046,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03763299062848091,
"mask/share_reasoning": 0.8435276746749878,
"mask/share_step_conf": 0.11883929371833801,
"num_tokens": 18619722.0,
"reward": 0.9079852104187012,
"reward_std": 0.24078664183616638,
"rewards/accuracy_reward_step": 0.640625,
"rewards/asymmetric_l2_reward": 0.8027098774909973,
"rewards/final_brier_reward_step": 0.696854293346405,
"rewards/format_reward_step": 0.94140625,
"step": 77
},
{
"adv/mean_abs_final_conf": 0.7457473278045654,
"adv/mean_abs_reasoning": 0.6220015287399292,
"adv/mean_abs_step_conf": 0.7352344989776611,
"adv/ratio_final_to_reasoning": 1.1989477410374287,
"adv/ratio_step_to_reasoning": 1.1820461285153478,
"adv/std_final_conf": 0.9125654101371765,
"adv/std_reasoning": 0.8429985642433167,
"adv/std_step_conf": 0.9346243143081665,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6882824726134585,
"calib/avg_num_step_conf": 5.11328125,
"calib/ece": 0.2450129333333334,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.572,
"calib/gap": 0.196561737089202,
"calib/mean_conf": 0.7558137333333333,
"calib/mu_c": 0.8407284037558685,
"calib/mu_w": 0.6441666666666666,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.953125,
"calib/pce": 0.2164133333333334,
"calib/std_conf": 0.3190380118248817,
"calib/step_conf_rate": 0.953125,
"calib/step_q_c": 0.5449234693877552,
"calib/step_q_c_n": 784.0,
"calib/step_q_gap": 0.02516442176870748,
"calib/step_q_w": 0.5197590476190477,
"calib/step_q_w_n": 525.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1655.0,
"completions/max_terminated_length": 1655.0,
"completions/mean_length": 531.875,
"completions/mean_terminated_length": 533.9608154296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.0832,
"grad_norm": 0.02739427238702774,
"kl": 0.0735321044921875,
"learning_rate": 3.3888888888888893e-06,
"loss": -0.1401,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03082401677966118,
"mask/share_reasoning": 0.8617717027664185,
"mask/share_step_conf": 0.10349804162979126,
"num_tokens": 18863906.0,
"reward": 0.8943131566047668,
"reward_std": 0.24572047591209412,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/asymmetric_l2_reward": 0.8101315498352051,
"rewards/final_brier_reward_step": 0.6777135133743286,
"rewards/format_reward_step": 0.94921875,
"step": 78
},
{
"adv/mean_abs_final_conf": 0.6829380989074707,
"adv/mean_abs_reasoning": 0.46346795558929443,
"adv/mean_abs_step_conf": 0.7522290945053101,
"adv/ratio_final_to_reasoning": 1.47353898078913,
"adv/ratio_step_to_reasoning": 1.6230444530924668,
"adv/std_final_conf": 0.8693846464157104,
"adv/std_reasoning": 0.7205665707588196,
"adv/std_step_conf": 0.9347177147865295,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6895833333333334,
"calib/avg_num_step_conf": 5.640625,
"calib/ece": 0.24385826771653552,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6417322834645669,
"calib/gap": 0.20205384615384603,
"calib/mean_conf": 0.7956692913385828,
"calib/mu_c": 0.8784,
"calib/mu_w": 0.6763461538461539,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.22448818897637804,
"calib/std_conf": 0.3087352675593296,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5525216316440049,
"calib/step_q_c_n": 809.0,
"calib/step_q_gap": 0.06615942691959542,
"calib/step_q_w": 0.4863622047244095,
"calib/step_q_w_n": 635.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1882.0,
"completions/max_terminated_length": 1882.0,
"completions/mean_length": 510.8671875,
"completions/mean_terminated_length": 514.8897705078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.08426666666666667,
"grad_norm": 0.031064137816429138,
"kl": 0.07048797607421875,
"learning_rate": 3.3611111111111117e-06,
"loss": -0.0686,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.033156618475914,
"mask/share_reasoning": 0.8510840535163879,
"mask/share_step_conf": 0.10794685781002045,
"num_tokens": 19101064.0,
"reward": 0.9335497617721558,
"reward_std": 0.19944220781326294,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/asymmetric_l2_reward": 0.847852885723114,
"rewards/final_brier_reward_step": 0.7051839828491211,
"rewards/format_reward_step": 0.984375,
"step": 79
},
{
"adv/mean_abs_final_conf": 0.6879172325134277,
"adv/mean_abs_reasoning": 0.5587252378463745,
"adv/mean_abs_step_conf": 0.7600916028022766,
"adv/ratio_final_to_reasoning": 1.231226345108426,
"adv/ratio_step_to_reasoning": 1.3604032023541224,
"adv/std_final_conf": 0.8734791278839111,
"adv/std_reasoning": 0.7755234241485596,
"adv/std_step_conf": 0.9350786805152893,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7002287581699347,
"calib/avg_num_step_conf": 5.828125,
"calib/ece": 0.2808827404479579,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.782608695652174,
"calib/gap": 0.18351067538126353,
"calib/mean_conf": 0.876376811594203,
"calib/mu_c": 0.9489106753812636,
"calib/mu_w": 0.7654000000000001,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.27625823451910414,
"calib/std_conf": 0.2520777155119635,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5498933962264151,
"calib/step_q_c_n": 848.0,
"calib/step_q_gap": 0.07029712293448959,
"calib/step_q_w": 0.4795962732919255,
"calib/step_q_w_n": 644.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1319.0,
"completions/max_terminated_length": 1319.0,
"completions/mean_length": 464.578125,
"completions/mean_terminated_length": 466.4000244140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.08533333333333333,
"grad_norm": 0.025256939232349396,
"kl": 0.0876312255859375,
"learning_rate": 3.3333333333333333e-06,
"loss": -0.0673,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03492242097854614,
"mask/share_reasoning": 0.8304387331008911,
"mask/share_step_conf": 0.13073261082172394,
"num_tokens": 19322156.0,
"reward": 0.9114360809326172,
"reward_std": 0.23360256850719452,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/asymmetric_l2_reward": 0.8124538660049438,
"rewards/final_brier_reward_step": 0.6955744028091431,
"rewards/format_reward_step": 0.9765625,
"step": 80
},
{
"adv/mean_abs_final_conf": 0.7039898633956909,
"adv/mean_abs_reasoning": 0.5247969627380371,
"adv/mean_abs_step_conf": 0.7595534324645996,
"adv/ratio_final_to_reasoning": 1.341451863064805,
"adv/ratio_step_to_reasoning": 1.4473281790766497,
"adv/std_final_conf": 0.8915676474571228,
"adv/std_reasoning": 0.7928330898284912,
"adv/std_step_conf": 0.9346292018890381,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.6952564809707666,
"calib/avg_num_step_conf": 5.0703125,
"calib/ece": 0.2584027100271004,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.6422764227642277,
"calib/gap": 0.22656881779738924,
"calib/mean_conf": 0.7620579945799457,
"calib/mu_c": 0.8523171171171171,
"calib/mu_w": 0.6257482993197279,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.20941734417344188,
"calib/std_conf": 0.3406476325655169,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.5421875,
"calib/step_q_c_n": 608.0,
"calib/step_q_gap": 0.10437039855072472,
"calib/step_q_w": 0.4378171014492753,
"calib/step_q_w_n": 690.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2644.0,
"completions/max_terminated_length": 2644.0,
"completions/mean_length": 529.484375,
"completions/mean_terminated_length": 529.484375,
"completions/min_length": 141.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.0864,
"grad_norm": 0.03870345279574394,
"kl": 0.07418060302734375,
"learning_rate": 3.3055555555555558e-06,
"loss": 0.0557,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.035466842353343964,
"mask/share_reasoning": 0.8551498055458069,
"mask/share_step_conf": 0.10938338935375214,
"num_tokens": 19563952.0,
"reward": 0.8947978019714355,
"reward_std": 0.23964188992977142,
"rewards/accuracy_reward_step": 0.578125,
"rewards/asymmetric_l2_reward": 0.8084914684295654,
"rewards/final_brier_reward_step": 0.6787604093551636,
"rewards/format_reward_step": 0.93359375,
"step": 81
},
{
"adv/mean_abs_final_conf": 0.6892867088317871,
"adv/mean_abs_reasoning": 0.49717363715171814,
"adv/mean_abs_step_conf": 0.7668944597244263,
"adv/ratio_final_to_reasoning": 1.3864104154449435,
"adv/ratio_step_to_reasoning": 1.5425082957292842,
"adv/std_final_conf": 0.8581146001815796,
"adv/std_reasoning": 0.7393258810043335,
"adv/std_step_conf": 0.9347033500671387,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6737952575216726,
"calib/avg_num_step_conf": 5.015625,
"calib/ece": 0.2896194225721785,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.7401574803149606,
"calib/gap": 0.1977885432602413,
"calib/mean_conf": 0.8453412073490814,
"calib/mu_c": 0.9278828828828828,
"calib/mu_w": 0.7300943396226415,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.2761417322834646,
"calib/std_conf": 0.2911954452589784,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5398409090909091,
"calib/step_q_c_n": 704.0,
"calib/step_q_gap": 0.06357194357366774,
"calib/step_q_w": 0.47626896551724135,
"calib/step_q_w_n": 580.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2208.0,
"completions/max_terminated_length": 2208.0,
"completions/mean_length": 454.90234375,
"completions/mean_terminated_length": 454.90234375,
"completions/min_length": 129.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.08746666666666666,
"grad_norm": 0.02759123221039772,
"kl": 0.09171295166015625,
"learning_rate": 3.277777777777778e-06,
"loss": 0.0137,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.037247899919748306,
"mask/share_reasoning": 0.8433917760848999,
"mask/share_step_conf": 0.11936035752296448,
"num_tokens": 19785959.0,
"reward": 0.908469557762146,
"reward_std": 0.21961763501167297,
"rewards/accuracy_reward_step": 0.578125,
"rewards/asymmetric_l2_reward": 0.8281220197677612,
"rewards/final_brier_reward_step": 0.6786607503890991,
"rewards/format_reward_step": 0.97265625,
"step": 82
},
{
"adv/mean_abs_final_conf": 0.6679799556732178,
"adv/mean_abs_reasoning": 0.5168710350990295,
"adv/mean_abs_step_conf": 0.7405879497528076,
"adv/ratio_final_to_reasoning": 1.2923532376799498,
"adv/ratio_step_to_reasoning": 1.4328292735748196,
"adv/std_final_conf": 0.8723222613334656,
"adv/std_reasoning": 0.7929161190986633,
"adv/std_step_conf": 0.9351591467857361,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7158919511860689,
"calib/avg_num_step_conf": 5.39453125,
"calib/ece": 0.2495102040816326,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.91015625,
"calib/frac_conf_gt_0.9": 0.6775510204081633,
"calib/gap": 0.2795797339914986,
"calib/mean_conf": 0.7936734693877551,
"calib/mu_c": 0.91006993006993,
"calib/mu_w": 0.6304901960784314,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9375,
"calib/pce": 0.22975510204081628,
"calib/std_conf": 0.3341653866316157,
"calib/step_conf_rate": 0.9375,
"calib/step_q_c": 0.48273054054054054,
"calib/step_q_c_n": 740.0,
"calib/step_q_gap": 0.04936725929054059,
"calib/step_q_w": 0.43336328124999995,
"calib/step_q_w_n": 640.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2648.0,
"completions/max_terminated_length": 2648.0,
"completions/mean_length": 554.6796875,
"completions/mean_terminated_length": 556.8549194335938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.08853333333333334,
"grad_norm": 0.03267595171928406,
"kl": 0.07877349853515625,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.0906,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.03222406283020973,
"mask/share_reasoning": 0.8548700213432312,
"mask/share_step_conf": 0.10899969935417175,
"num_tokens": 20035221.0,
"reward": 0.8798561096191406,
"reward_std": 0.2699333727359772,
"rewards/accuracy_reward_step": 0.5625,
"rewards/asymmetric_l2_reward": 0.7940636277198792,
"rewards/final_brier_reward_step": 0.6711171865463257,
"rewards/format_reward_step": 0.91015625,
"step": 83
},
{
"adv/mean_abs_final_conf": 0.711301326751709,
"adv/mean_abs_reasoning": 0.5254456996917725,
"adv/mean_abs_step_conf": 0.7322558164596558,
"adv/ratio_final_to_reasoning": 1.3537104351010196,
"adv/ratio_step_to_reasoning": 1.393589893093042,
"adv/std_final_conf": 0.8830074071884155,
"adv/std_reasoning": 0.792775571346283,
"adv/std_step_conf": 0.934762716293335,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7625246548323472,
"calib/avg_num_step_conf": 4.6953125,
"calib/ece": 0.29368421052631577,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.680161943319838,
"calib/gap": 0.274820512820513,
"calib/mean_conf": 0.785668016194332,
"calib/mu_c": 0.9158461538461539,
"calib/mu_w": 0.6410256410256409,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.27651821862348175,
"calib/std_conf": 0.3377123849800877,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.49211437403400315,
"calib/step_q_c_n": 647.0,
"calib/step_q_gap": 0.05764590556553467,
"calib/step_q_w": 0.4344684684684685,
"calib/step_q_w_n": 555.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2879.0,
"completions/max_terminated_length": 2879.0,
"completions/mean_length": 498.296875,
"completions/mean_terminated_length": 498.296875,
"completions/min_length": 163.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.0896,
"grad_norm": 0.041336141526699066,
"kl": 0.08129119873046875,
"learning_rate": 3.2222222222222227e-06,
"loss": -0.0067,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.036794595420360565,
"mask/share_reasoning": 0.8531485795974731,
"mask/share_step_conf": 0.11005677282810211,
"num_tokens": 20268705.0,
"reward": 0.8911169767379761,
"reward_std": 0.24814923107624054,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/asymmetric_l2_reward": 0.8201553225517273,
"rewards/final_brier_reward_step": 0.6706722974777222,
"rewards/format_reward_step": 0.94921875,
"step": 84
},
{
"adv/mean_abs_final_conf": 0.7375852465629578,
"adv/mean_abs_reasoning": 0.6083944439888,
"adv/mean_abs_step_conf": 0.7445007562637329,
"adv/ratio_final_to_reasoning": 1.212347111073447,
"adv/ratio_step_to_reasoning": 1.2237139303616626,
"adv/std_final_conf": 0.8906832933425903,
"adv/std_reasoning": 0.8099904656410217,
"adv/std_step_conf": 0.9348547458648682,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.7101584022038567,
"calib/avg_num_step_conf": 5.140625,
"calib/ece": 0.31475795297372056,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.6473029045643154,
"calib/gap": 0.2235241046831956,
"calib/mean_conf": 0.7624757952973721,
"calib/mu_c": 0.8737741046831956,
"calib/mu_w": 0.65025,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.9453125,
"calib/pce": 0.2875795297372061,
"calib/std_conf": 0.3512132715214306,
"calib/step_conf_rate": 0.9453125,
"calib/step_q_c": 0.46668918918918917,
"calib/step_q_c_n": 592.0,
"calib/step_q_gap": 0.060110920312577754,
"calib/step_q_w": 0.4065782688766114,
"calib/step_q_w_n": 724.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2932.0,
"completions/max_terminated_length": 2932.0,
"completions/mean_length": 548.95703125,
"completions/mean_terminated_length": 553.279541015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.09066666666666667,
"grad_norm": 0.03381429985165596,
"kl": 0.07662200927734375,
"learning_rate": 3.1944444444444443e-06,
"loss": -0.0631,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.033809252083301544,
"mask/share_reasoning": 0.844788670539856,
"mask/share_step_conf": 0.11358959972858429,
"num_tokens": 20517062.0,
"reward": 0.848590612411499,
"reward_std": 0.26348742842674255,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/asymmetric_l2_reward": 0.8029472827911377,
"rewards/final_brier_reward_step": 0.6153277158737183,
"rewards/format_reward_step": 0.921875,
"step": 85
},
{
"adv/mean_abs_final_conf": 0.7561108469963074,
"adv/mean_abs_reasoning": 0.4853861927986145,
"adv/mean_abs_step_conf": 0.7464814782142639,
"adv/ratio_final_to_reasoning": 1.5577510407470858,
"adv/ratio_step_to_reasoning": 1.5379124690594097,
"adv/std_final_conf": 0.9270208477973938,
"adv/std_reasoning": 0.7754190564155579,
"adv/std_step_conf": 0.9350854158401489,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6685927067283,
"calib/avg_num_step_conf": 4.6015625,
"calib/ece": 0.31136,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.568,
"calib/gap": 0.23257960965588087,
"calib/mean_conf": 0.69152,
"calib/mu_c": 0.814322033898305,
"calib/mu_w": 0.5817424242424242,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.26544,
"calib/std_conf": 0.3847857710466956,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.46849652777777784,
"calib/step_q_c_n": 576.0,
"calib/step_q_gap": 0.058679252030269524,
"calib/step_q_w": 0.4098172757475083,
"calib/step_q_w_n": 602.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1938.0,
"completions/max_terminated_length": 1938.0,
"completions/mean_length": 488.9609375,
"completions/mean_terminated_length": 490.8784484863281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.09173333333333333,
"grad_norm": 0.044561564922332764,
"kl": 0.08454132080078125,
"learning_rate": 3.1666666666666667e-06,
"loss": -0.0951,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03566384315490723,
"mask/share_reasoning": 0.8496521711349487,
"mask/share_step_conf": 0.11077769100666046,
"num_tokens": 20747748.0,
"reward": 0.8705247640609741,
"reward_std": 0.25272929668426514,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/asymmetric_l2_reward": 0.8211120367050171,
"rewards/final_brier_reward_step": 0.6363437175750732,
"rewards/format_reward_step": 0.95703125,
"step": 86
},
{
"adv/mean_abs_final_conf": 0.7711803913116455,
"adv/mean_abs_reasoning": 0.6064971685409546,
"adv/mean_abs_step_conf": 0.7583389282226562,
"adv/ratio_final_to_reasoning": 1.271531725641635,
"adv/ratio_step_to_reasoning": 1.2503585631685408,
"adv/std_final_conf": 0.9249719381332397,
"adv/std_reasoning": 0.8746480345726013,
"adv/std_step_conf": 0.9346683621406555,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6656976744186047,
"calib/avg_num_step_conf": 5.203125,
"calib/ece": 0.2269611780455154,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.5943775100401606,
"calib/gap": 0.2687181616832781,
"calib/mean_conf": 0.6965729585006694,
"calib/mu_c": 0.779670542635659,
"calib/mu_w": 0.5109523809523809,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.1163855421686747,
"calib/std_conf": 0.389896312445251,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.4399458997722096,
"calib/step_q_c_n": 878.0,
"calib/step_q_gap": 0.051287529728156755,
"calib/step_q_w": 0.38865837004405285,
"calib/step_q_w_n": 454.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2910.0,
"completions/max_terminated_length": 2910.0,
"completions/mean_length": 489.95703125,
"completions/mean_terminated_length": 491.8784484863281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 120.0,
"epoch": 0.0928,
"grad_norm": 0.04757794737815857,
"kl": 0.0927886962890625,
"learning_rate": 3.138888888888889e-06,
"loss": -0.0186,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.036881767213344574,
"mask/share_reasoning": 0.8420805931091309,
"mask/share_step_conf": 0.11713138222694397,
"num_tokens": 20978673.0,
"reward": 0.947090744972229,
"reward_std": 0.24253800511360168,
"rewards/accuracy_reward_step": 0.671875,
"rewards/asymmetric_l2_reward": 0.8506828546524048,
"rewards/final_brier_reward_step": 0.7169361114501953,
"rewards/format_reward_step": 0.9609375,
"step": 87
},
{
"adv/mean_abs_final_conf": 0.6935921907424927,
"adv/mean_abs_reasoning": 0.47695034742355347,
"adv/mean_abs_step_conf": 0.7630202770233154,
"adv/ratio_final_to_reasoning": 1.4542230538026035,
"adv/ratio_step_to_reasoning": 1.5997897499081155,
"adv/std_final_conf": 0.8939658999443054,
"adv/std_reasoning": 0.7392098903656006,
"adv/std_step_conf": 0.9342235922813416,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7492699596147873,
"calib/avg_num_step_conf": 5.25390625,
"calib/ece": 0.21371054687500007,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.51171875,
"calib/gap": 0.3207307300403851,
"calib/mean_conf": 0.664726953125,
"calib/mu_c": 0.8037937931034482,
"calib/mu_w": 0.4830630630630631,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.15601562500000005,
"calib/std_conf": 0.3798028370295785,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.4338300492610837,
"calib/step_q_c_n": 812.0,
"calib/step_q_gap": 0.04416776033050207,
"calib/step_q_w": 0.38966228893058164,
"calib/step_q_w_n": 533.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1490.0,
"completions/max_terminated_length": 1490.0,
"completions/mean_length": 505.84375,
"completions/mean_terminated_length": 507.8274841308594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.09386666666666667,
"grad_norm": 0.04795219376683235,
"kl": 0.08171844482421875,
"learning_rate": 3.1111111111111116e-06,
"loss": -0.0578,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03347531333565712,
"mask/share_reasoning": 0.8500121831893921,
"mask/share_step_conf": 0.11260630190372467,
"num_tokens": 21218017.0,
"reward": 0.950875461101532,
"reward_std": 0.1858539879322052,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/asymmetric_l2_reward": 0.8525054454803467,
"rewards/final_brier_reward_step": 0.740651547908783,
"rewards/format_reward_step": 0.9765625,
"step": 88
},
{
"adv/mean_abs_final_conf": 0.7229398488998413,
"adv/mean_abs_reasoning": 0.47888630628585815,
"adv/mean_abs_step_conf": 0.743857204914093,
"adv/ratio_final_to_reasoning": 1.5096273153158446,
"adv/ratio_step_to_reasoning": 1.5533064845459743,
"adv/std_final_conf": 0.891949474811554,
"adv/std_reasoning": 0.75757896900177,
"adv/std_step_conf": 0.9348465204238892,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7704152467499685,
"calib/avg_num_step_conf": 5.07421875,
"calib/ece": 0.1756086956521739,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.43478260869565216,
"calib/gap": 0.37849211157389884,
"calib/mean_conf": 0.5794901185770751,
"calib/mu_c": 0.7500359712230216,
"calib/mu_w": 0.37154385964912273,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.1028458498023715,
"calib/std_conf": 0.4025525435239658,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.44734042553191494,
"calib/step_q_c_n": 658.0,
"calib/step_q_gap": 0.09799409167856082,
"calib/step_q_w": 0.3493463338533541,
"calib/step_q_w_n": 641.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2286.0,
"completions/max_terminated_length": 2286.0,
"completions/mean_length": 525.1953125,
"completions/mean_terminated_length": 525.1953125,
"completions/min_length": 143.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.09493333333333333,
"grad_norm": 0.033446215093135834,
"kl": 0.0828399658203125,
"learning_rate": 3.0833333333333336e-06,
"loss": -0.0353,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03558123856782913,
"mask/share_reasoning": 0.8521634936332703,
"mask/share_step_conf": 0.11225523054599762,
"num_tokens": 21461355.0,
"reward": 0.9454695582389832,
"reward_std": 0.22363021969795227,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/asymmetric_l2_reward": 0.8446345329284668,
"rewards/final_brier_reward_step": 0.7455234527587891,
"rewards/format_reward_step": 0.9609375,
"step": 89
},
{
"adv/mean_abs_final_conf": 0.7110995054244995,
"adv/mean_abs_reasoning": 0.510982096195221,
"adv/mean_abs_step_conf": 0.7581682205200195,
"adv/ratio_final_to_reasoning": 1.3916329177075974,
"adv/ratio_step_to_reasoning": 1.4837471335401955,
"adv/std_final_conf": 0.893814206123352,
"adv/std_reasoning": 0.7575870156288147,
"adv/std_step_conf": 0.933884859085083,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7209821428571428,
"calib/avg_num_step_conf": 6.0859375,
"calib/ece": 0.21139442231075695,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.545816733067729,
"calib/gap": 0.3062506868131869,
"calib/mean_conf": 0.678406374501992,
"calib/mu_c": 0.7894375,
"calib/mu_w": 0.4831868131868131,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.12617529880478087,
"calib/std_conf": 0.3872137809409691,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4215166484118291,
"calib/step_q_c_n": 913.0,
"calib/step_q_gap": 0.06626083445834074,
"calib/step_q_w": 0.3552558139534884,
"calib/step_q_w_n": 645.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2538.0,
"completions/max_terminated_length": 2538.0,
"completions/mean_length": 533.96484375,
"completions/mean_terminated_length": 533.96484375,
"completions/min_length": 157.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.096,
"grad_norm": 0.029829688370227814,
"kl": 0.0811309814453125,
"learning_rate": 3.055555555555556e-06,
"loss": 0.0606,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03403983637690544,
"mask/share_reasoning": 0.8410810828208923,
"mask/share_step_conf": 0.12487903982400894,
"num_tokens": 21701370.0,
"reward": 0.9619349241256714,
"reward_std": 0.1955302506685257,
"rewards/accuracy_reward_step": 0.625,
"rewards/asymmetric_l2_reward": 0.8681378364562988,
"rewards/final_brier_reward_step": 0.736200749874115,
"rewards/format_reward_step": 0.97265625,
"step": 90
},
{
"adv/mean_abs_final_conf": 0.7319117784500122,
"adv/mean_abs_reasoning": 0.5304526090621948,
"adv/mean_abs_step_conf": 0.7541947364807129,
"adv/ratio_final_to_reasoning": 1.3797873098295885,
"adv/ratio_step_to_reasoning": 1.4217947533787785,
"adv/std_final_conf": 0.8858980536460876,
"adv/std_reasoning": 0.7927713394165039,
"adv/std_step_conf": 0.9336501359939575,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7143346346140675,
"calib/avg_num_step_conf": 6.0,
"calib/ece": 0.23464520000000003,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.52,
"calib/gap": 0.28583001849188405,
"calib/mean_conf": 0.6415948000000001,
"calib/mu_c": 0.7479235668789809,
"calib/mu_w": 0.4620935483870968,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.12412000000000004,
"calib/std_conf": 0.4015845626676404,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4017911318553092,
"calib/step_q_c_n": 857.0,
"calib/step_q_gap": 0.06388833362261404,
"calib/step_q_w": 0.33790279823269515,
"calib/step_q_w_n": 679.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2684.0,
"completions/max_terminated_length": 2684.0,
"completions/mean_length": 534.22265625,
"completions/mean_terminated_length": 540.5573120117188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 208.0,
"epoch": 0.09706666666666666,
"grad_norm": 0.04101370647549629,
"kl": 0.0859375,
"learning_rate": 3.0277777777777776e-06,
"loss": -0.0541,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03051559254527092,
"mask/share_reasoning": 0.8473619222640991,
"mask/share_step_conf": 0.11040370911359787,
"num_tokens": 21945843.0,
"reward": 0.9472370147705078,
"reward_std": 0.20020480453968048,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/asymmetric_l2_reward": 0.8600229024887085,
"rewards/final_brier_reward_step": 0.7172636389732361,
"rewards/format_reward_step": 0.97265625,
"step": 91
},
{
"adv/mean_abs_final_conf": 0.6581138372421265,
"adv/mean_abs_reasoning": 0.5041226744651794,
"adv/mean_abs_step_conf": 0.7552310228347778,
"adv/ratio_final_to_reasoning": 1.3054636710009428,
"adv/ratio_step_to_reasoning": 1.4981096092057309,
"adv/std_final_conf": 0.8489682674407959,
"adv/std_reasoning": 0.7752818465232849,
"adv/std_step_conf": 0.933874785900116,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7361003611971104,
"calib/avg_num_step_conf": 5.03515625,
"calib/ece": 0.21250656167979007,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5984251968503937,
"calib/gap": 0.2983836429308565,
"calib/mean_conf": 0.7195406824146982,
"calib/mu_c": 0.8393640350877193,
"calib/mu_w": 0.5409803921568628,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.1668110236220473,
"calib/std_conf": 0.36532456459468643,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4126775956284153,
"calib/step_q_c_n": 732.0,
"calib/step_q_gap": 0.06470632094259843,
"calib/step_q_w": 0.34797127468581684,
"calib/step_q_w_n": 557.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1995.0,
"completions/max_terminated_length": 1995.0,
"completions/mean_length": 476.046875,
"completions/mean_terminated_length": 476.046875,
"completions/min_length": 165.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.09813333333333334,
"grad_norm": 0.030025403946638107,
"kl": 0.0915374755859375,
"learning_rate": 3e-06,
"loss": -0.0822,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03648817166686058,
"mask/share_reasoning": 0.8472324013710022,
"mask/share_step_conf": 0.11627940833568573,
"num_tokens": 22174431.0,
"reward": 0.9668546915054321,
"reward_std": 0.18710312247276306,
"rewards/accuracy_reward_step": 0.59375,
"rewards/asymmetric_l2_reward": 0.870896577835083,
"rewards/final_brier_reward_step": 0.7471877336502075,
"rewards/format_reward_step": 0.984375,
"step": 92
},
{
"adv/mean_abs_final_conf": 0.6915310025215149,
"adv/mean_abs_reasoning": 0.5947073698043823,
"adv/mean_abs_step_conf": 0.7633007764816284,
"adv/ratio_final_to_reasoning": 1.1628088663992526,
"adv/ratio_step_to_reasoning": 1.28348968793291,
"adv/std_final_conf": 0.8915781378746033,
"adv/std_reasoning": 0.8265567421913147,
"adv/std_step_conf": 0.9339955449104309,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6820295846521542,
"calib/avg_num_step_conf": 6.15625,
"calib/ece": 0.27851960000000003,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.504,
"calib/gap": 0.23995874297526,
"calib/mean_conf": 0.6413204,
"calib/mu_c": 0.7497817518248175,
"calib/mu_w": 0.5098230088495576,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.18592000000000006,
"calib/std_conf": 0.40051339126655927,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4214962078651686,
"calib/step_q_c_n": 712.0,
"calib/step_q_gap": 0.1148526893466501,
"calib/step_q_w": 0.3066435185185185,
"calib/step_q_w_n": 864.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2548.0,
"completions/max_terminated_length": 2548.0,
"completions/mean_length": 537.7578125,
"completions/mean_terminated_length": 539.86669921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.0992,
"grad_norm": 0.027518408372998238,
"kl": 0.081146240234375,
"learning_rate": 2.9722222222222225e-06,
"loss": 0.0065,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03318294137716293,
"mask/share_reasoning": 0.8384314775466919,
"mask/share_step_conf": 0.12447934597730637,
"num_tokens": 22417873.0,
"reward": 0.9200072288513184,
"reward_std": 0.20049725472927094,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/asymmetric_l2_reward": 0.8553484678268433,
"rewards/final_brier_reward_step": 0.6831035017967224,
"rewards/format_reward_step": 0.97265625,
"step": 93
},
{
"adv/mean_abs_final_conf": 0.6092467904090881,
"adv/mean_abs_reasoning": 0.5352387428283691,
"adv/mean_abs_step_conf": 0.7489203214645386,
"adv/ratio_final_to_reasoning": 1.1382710959779132,
"adv/ratio_step_to_reasoning": 1.3992266656688734,
"adv/std_final_conf": 0.83155757188797,
"adv/std_reasoning": 0.7928059697151184,
"adv/std_step_conf": 0.9342983961105347,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7352882966090515,
"calib/avg_num_step_conf": 5.47265625,
"calib/ece": 0.1942570281124499,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.6144578313253012,
"calib/gap": 0.34813497822931794,
"calib/mean_conf": 0.7179518072289156,
"calib/mu_c": 0.8661538461538462,
"calib/mu_w": 0.5180188679245282,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.1689558232931728,
"calib/std_conf": 0.37905852205075696,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4239228723404256,
"calib/step_q_c_n": 752.0,
"calib/step_q_gap": 0.09478573828803727,
"calib/step_q_w": 0.3291371340523883,
"calib/step_q_w_n": 649.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2317.0,
"completions/max_terminated_length": 2317.0,
"completions/mean_length": 487.31640625,
"completions/mean_terminated_length": 491.1535339355469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.10026666666666667,
"grad_norm": 0.04939788579940796,
"kl": 0.08742523193359375,
"learning_rate": 2.944444444444445e-06,
"loss": -0.0123,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03488625958561897,
"mask/share_reasoning": 0.8387618064880371,
"mask/share_step_conf": 0.11853942275047302,
"num_tokens": 22651306.0,
"reward": 0.9398282766342163,
"reward_std": 0.20985865592956543,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/asymmetric_l2_reward": 0.8465824127197266,
"rewards/final_brier_reward_step": 0.7291679382324219,
"rewards/format_reward_step": 0.9609375,
"step": 94
},
{
"adv/mean_abs_final_conf": 0.5665971636772156,
"adv/mean_abs_reasoning": 0.4219573140144348,
"adv/mean_abs_step_conf": 0.7465494871139526,
"adv/ratio_final_to_reasoning": 1.342783131987215,
"adv/ratio_step_to_reasoning": 1.7692535768876703,
"adv/std_final_conf": 0.799198567867279,
"adv/std_reasoning": 0.7206243276596069,
"adv/std_step_conf": 0.9329851865768433,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7522349936143038,
"calib/avg_num_step_conf": 5.78125,
"calib/ece": 0.1738554216867471,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.6506024096385542,
"calib/gap": 0.3717943805874839,
"calib/mean_conf": 0.7402811244979919,
"calib/mu_c": 0.870185185185185,
"calib/mu_w": 0.4983908045977011,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.13176706827309248,
"calib/std_conf": 0.3756979065059484,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.3911706315789474,
"calib/step_q_c_n": 950.0,
"calib/step_q_gap": 0.0785217636544191,
"calib/step_q_w": 0.3126488679245283,
"calib/step_q_w_n": 530.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2545.0,
"completions/max_terminated_length": 2545.0,
"completions/mean_length": 511.7265625,
"completions/mean_terminated_length": 511.7265625,
"completions/min_length": 119.0,
"completions/min_terminated_length": 119.0,
"epoch": 0.10133333333333333,
"grad_norm": 0.02649582363665104,
"kl": 0.07971954345703125,
"learning_rate": 2.916666666666667e-06,
"loss": -0.0095,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03568973019719124,
"mask/share_reasoning": 0.8352484107017517,
"mask/share_step_conf": 0.12906186282634735,
"num_tokens": 22888436.0,
"reward": 0.9772668480873108,
"reward_std": 0.18089798092842102,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/asymmetric_l2_reward": 0.867957592010498,
"rewards/final_brier_reward_step": 0.7670449018478394,
"rewards/format_reward_step": 0.96484375,
"step": 95
},
{
"adv/mean_abs_final_conf": 0.6147359609603882,
"adv/mean_abs_reasoning": 0.4808635711669922,
"adv/mean_abs_step_conf": 0.7362314462661743,
"adv/ratio_final_to_reasoning": 1.2783999408990485,
"adv/ratio_step_to_reasoning": 1.5310609711595289,
"adv/std_final_conf": 0.8461417555809021,
"adv/std_reasoning": 0.7393701672554016,
"adv/std_step_conf": 0.9344803690910339,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7698014629049112,
"calib/avg_num_step_conf": 5.21484375,
"calib/ece": 0.17388888888888893,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6746031746031746,
"calib/gap": 0.4222988505747126,
"calib/mean_conf": 0.7508730158730158,
"calib/mu_c": 0.8966666666666666,
"calib/mu_w": 0.474367816091954,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.13500000000000006,
"calib/std_conf": 0.37173556876008074,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.415,
"calib/step_q_c_n": 790.0,
"calib/step_q_gap": 0.07909174311926603,
"calib/step_q_w": 0.33590825688073395,
"calib/step_q_w_n": 545.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2628.0,
"completions/max_terminated_length": 2628.0,
"completions/mean_length": 462.59765625,
"completions/mean_terminated_length": 462.59765625,
"completions/min_length": 177.0,
"completions/min_terminated_length": 177.0,
"epoch": 0.1024,
"grad_norm": 0.026068033650517464,
"kl": 0.093292236328125,
"learning_rate": 2.888888888888889e-06,
"loss": 0.0366,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03652406111359596,
"mask/share_reasoning": 0.8425166606903076,
"mask/share_step_conf": 0.12095930427312851,
"num_tokens": 23112677.0,
"reward": 0.9878829717636108,
"reward_std": 0.2013242542743683,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/asymmetric_l2_reward": 0.849997878074646,
"rewards/final_brier_reward_step": 0.8007679581642151,
"rewards/format_reward_step": 0.98046875,
"step": 96
},
{
"adv/mean_abs_final_conf": 0.7184832096099854,
"adv/mean_abs_reasoning": 0.59392249584198,
"adv/mean_abs_step_conf": 0.7223066091537476,
"adv/ratio_final_to_reasoning": 1.2097255359748929,
"adv/ratio_step_to_reasoning": 1.2161630755032482,
"adv/std_final_conf": 0.9094932079315186,
"adv/std_reasoning": 0.8265097737312317,
"adv/std_step_conf": 0.934262752532959,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6376647834274953,
"calib/avg_num_step_conf": 5.92578125,
"calib/ece": 0.28857312252964423,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5928853754940712,
"calib/gap": 0.2069691148775895,
"calib/mean_conf": 0.7016245059288537,
"calib/mu_c": 0.7981555555555556,
"calib/mu_w": 0.5911864406779661,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.22830039525691698,
"calib/std_conf": 0.3787427366765338,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.40408163265306124,
"calib/step_q_c_n": 735.0,
"calib/step_q_gap": 0.07610209301111748,
"calib/step_q_w": 0.32797953964194376,
"calib/step_q_w_n": 782.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2470.0,
"completions/max_terminated_length": 2470.0,
"completions/mean_length": 460.6875,
"completions/mean_terminated_length": 462.494140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.10346666666666667,
"grad_norm": 0.0355035662651062,
"kl": 0.0928802490234375,
"learning_rate": 2.861111111111111e-06,
"loss": -0.0871,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03594241291284561,
"mask/share_reasoning": 0.8263546228408813,
"mask/share_step_conf": 0.13379667699337006,
"num_tokens": 23335685.0,
"reward": 0.9316617250442505,
"reward_std": 0.1955932229757309,
"rewards/accuracy_reward_step": 0.53125,
"rewards/asymmetric_l2_reward": 0.8849480152130127,
"rewards/final_brier_reward_step": 0.6744691133499146,
"rewards/format_reward_step": 0.98828125,
"step": 97
},
{
"adv/mean_abs_final_conf": 0.6633474826812744,
"adv/mean_abs_reasoning": 0.5149575471878052,
"adv/mean_abs_step_conf": 0.7673736214637756,
"adv/ratio_final_to_reasoning": 1.2881595508286654,
"adv/ratio_step_to_reasoning": 1.490168705467898,
"adv/std_final_conf": 0.8635463714599609,
"adv/std_reasoning": 0.7928749918937683,
"adv/std_step_conf": 0.9342027306556702,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7269079083927212,
"calib/avg_num_step_conf": 4.8359375,
"calib/ece": 0.24089430894308933,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.6382113821138211,
"calib/gap": 0.33125164325745526,
"calib/mean_conf": 0.718780487804878,
"calib/mu_c": 0.8493959731543624,
"calib/mu_w": 0.5181443298969072,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.1769918699186991,
"calib/std_conf": 0.3954385963986233,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.410972602739726,
"calib/step_q_c_n": 730.0,
"calib/step_q_gap": 0.04305724840901737,
"calib/step_q_w": 0.36791535433070865,
"calib/step_q_w_n": 508.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3024.0,
"completions/max_terminated_length": 3024.0,
"completions/mean_length": 510.96875,
"completions/mean_terminated_length": 512.9725952148438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.10453333333333334,
"grad_norm": 0.054965581744909286,
"kl": 0.08089447021484375,
"learning_rate": 2.8333333333333335e-06,
"loss": -0.0625,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.035400211811065674,
"mask/share_reasoning": 0.8491687774658203,
"mask/share_step_conf": 0.11152474582195282,
"num_tokens": 23572677.0,
"reward": 0.932797372341156,
"reward_std": 0.24011409282684326,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/asymmetric_l2_reward": 0.8460032343864441,
"rewards/final_brier_reward_step": 0.7133413553237915,
"rewards/format_reward_step": 0.94921875,
"step": 98
},
{
"adv/mean_abs_final_conf": 0.7435926198959351,
"adv/mean_abs_reasoning": 0.6365935802459717,
"adv/mean_abs_step_conf": 0.7237546443939209,
"adv/ratio_final_to_reasoning": 1.1680806137074462,
"adv/ratio_step_to_reasoning": 1.1369179125467639,
"adv/std_final_conf": 0.9068244099617004,
"adv/std_reasoning": 0.859096884727478,
"adv/std_step_conf": 0.9345636367797852,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6776324614352783,
"calib/avg_num_step_conf": 5.875,
"calib/ece": 0.2825910931174089,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.4534412955465587,
"calib/gap": 0.30777598926894706,
"calib/mean_conf": 0.5718218623481782,
"calib/mu_c": 0.7487619047619049,
"calib/mu_w": 0.4409859154929578,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.2146558704453441,
"calib/std_conf": 0.41920147804862407,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.3843027210884354,
"calib/step_q_c_n": 588.0,
"calib/step_q_gap": 0.06500141104476725,
"calib/step_q_w": 0.31930131004366813,
"calib/step_q_w_n": 916.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2887.0,
"completions/max_terminated_length": 2887.0,
"completions/mean_length": 586.73828125,
"completions/mean_terminated_length": 596.0516357421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 120.0,
"epoch": 0.1056,
"grad_norm": 0.060040753334760666,
"kl": 0.07159423828125,
"learning_rate": 2.805555555555556e-06,
"loss": -0.0674,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.031188489869236946,
"mask/share_reasoning": 0.8459464311599731,
"mask/share_step_conf": 0.10724010318517685,
"num_tokens": 23828682.0,
"reward": 0.8999078273773193,
"reward_std": 0.24172960221767426,
"rewards/accuracy_reward_step": 0.41015625,
"rewards/asymmetric_l2_reward": 0.8456334471702576,
"rewards/final_brier_reward_step": 0.6799633502960205,
"rewards/format_reward_step": 0.9609375,
"step": 99
},
{
"adv/mean_abs_final_conf": 0.7108124494552612,
"adv/mean_abs_reasoning": 0.5766913294792175,
"adv/mean_abs_step_conf": 0.7223371863365173,
"adv/ratio_final_to_reasoning": 1.232570030309216,
"adv/ratio_step_to_reasoning": 1.2525542684833249,
"adv/std_final_conf": 0.9088829755783081,
"adv/std_reasoning": 0.8098970055580139,
"adv/std_step_conf": 0.9337549805641174,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7749548037190084,
"calib/avg_num_step_conf": 5.78125,
"calib/ece": 0.19120481927710847,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.5261044176706827,
"calib/gap": 0.4201213842975208,
"calib/mean_conf": 0.6380321285140562,
"calib/mu_c": 0.8421875000000001,
"calib/mu_w": 0.4220661157024793,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.15759036144578317,
"calib/std_conf": 0.4125205884747448,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.39832669322709163,
"calib/step_q_c_n": 753.0,
"calib/step_q_gap": 0.06741885278692661,
"calib/step_q_w": 0.330907840440165,
"calib/step_q_w_n": 727.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2536.0,
"completions/max_terminated_length": 2536.0,
"completions/mean_length": 584.765625,
"completions/mean_terminated_length": 584.765625,
"completions/min_length": 147.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.10666666666666667,
"grad_norm": 0.024518176913261414,
"kl": 0.07038116455078125,
"learning_rate": 2.7777777777777783e-06,
"loss": 0.0789,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03139622509479523,
"mask/share_reasoning": 0.8543316721916199,
"mask/share_step_conf": 0.1142721176147461,
"num_tokens": 24085790.0,
"reward": 0.9550304412841797,
"reward_std": 0.22531947493553162,
"rewards/accuracy_reward_step": 0.5,
"rewards/asymmetric_l2_reward": 0.8710557818412781,
"rewards/final_brier_reward_step": 0.7460362911224365,
"rewards/format_reward_step": 0.96484375,
"step": 100
},
{
"adv/mean_abs_final_conf": 0.6849584579467773,
"adv/mean_abs_reasoning": 0.5034070014953613,
"adv/mean_abs_step_conf": 0.7498930096626282,
"adv/ratio_final_to_reasoning": 1.3606454735673534,
"adv/ratio_step_to_reasoning": 1.4896356376353221,
"adv/std_final_conf": 0.8788121938705444,
"adv/std_reasoning": 0.7753551006317139,
"adv/std_step_conf": 0.9335606098175049,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.6994607347489045,
"calib/avg_num_step_conf": 6.359375,
"calib/ece": 0.23389344262295084,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.4098360655737705,
"calib/gap": 0.30889046174587115,
"calib/mean_conf": 0.5250409836065575,
"calib/mu_c": 0.6883478260869564,
"calib/mu_w": 0.3794573643410853,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.1438114754098361,
"calib/std_conf": 0.4189100787257916,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.3757354925775978,
"calib/step_q_c_n": 741.0,
"calib/step_q_gap": 0.07503650723374206,
"calib/step_q_w": 0.30069898534385575,
"calib/step_q_w_n": 887.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2754.0,
"completions/max_terminated_length": 2754.0,
"completions/mean_length": 583.92578125,
"completions/mean_terminated_length": 586.2156982421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.10773333333333333,
"grad_norm": 0.034379322081804276,
"kl": 0.0738525390625,
"learning_rate": 2.7500000000000004e-06,
"loss": -0.0217,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.030435508117079735,
"mask/share_reasoning": 0.8451032638549805,
"mask/share_step_conf": 0.12055499106645584,
"num_tokens": 24342267.0,
"reward": 0.9144086837768555,
"reward_std": 0.21262916922569275,
"rewards/accuracy_reward_step": 0.453125,
"rewards/asymmetric_l2_reward": 0.855229377746582,
"rewards/final_brier_reward_step": 0.6923378705978394,
"rewards/format_reward_step": 0.953125,
"step": 101
},
{
"adv/mean_abs_final_conf": 0.5878695845603943,
"adv/mean_abs_reasoning": 0.3899923264980316,
"adv/mean_abs_step_conf": 0.7400535941123962,
"adv/ratio_final_to_reasoning": 1.5073875679534976,
"adv/ratio_step_to_reasoning": 1.8976106549525442,
"adv/std_final_conf": 0.8100722432136536,
"adv/std_reasoning": 0.681530773639679,
"adv/std_step_conf": 0.9334313869476318,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7806785051683011,
"calib/avg_num_step_conf": 5.70703125,
"calib/ece": 0.17003968253968255,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5595238095238095,
"calib/gap": 0.40557513914656784,
"calib/mean_conf": 0.6737698412698412,
"calib/mu_c": 0.8314935064935066,
"calib/mu_w": 0.4259183673469388,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.11634920634920634,
"calib/std_conf": 0.38713599724165704,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.40686440677966107,
"calib/step_q_c_n": 826.0,
"calib/step_q_gap": 0.0938722807954091,
"calib/step_q_w": 0.31299212598425197,
"calib/step_q_w_n": 635.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2741.0,
"completions/max_terminated_length": 2741.0,
"completions/mean_length": 480.0234375,
"completions/mean_terminated_length": 480.0234375,
"completions/min_length": 124.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.1088,
"grad_norm": 0.06203492358326912,
"kl": 0.08522796630859375,
"learning_rate": 2.7222222222222224e-06,
"loss": 0.0534,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03778192400932312,
"mask/share_reasoning": 0.8289196491241455,
"mask/share_step_conf": 0.13329845666885376,
"num_tokens": 24571849.0,
"reward": 0.9879881143569946,
"reward_std": 0.15715520083904266,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/asymmetric_l2_reward": 0.8699907064437866,
"rewards/final_brier_reward_step": 0.7887980341911316,
"rewards/format_reward_step": 0.984375,
"step": 102
},
{
"adv/mean_abs_final_conf": 0.6246525049209595,
"adv/mean_abs_reasoning": 0.3913930654525757,
"adv/mean_abs_step_conf": 0.733914852142334,
"adv/ratio_final_to_reasoning": 1.595972335888632,
"adv/ratio_step_to_reasoning": 1.8751350417864288,
"adv/std_final_conf": 0.8219296932220459,
"adv/std_reasoning": 0.6816326975822449,
"adv/std_step_conf": 0.9333863854408264,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7566779346457214,
"calib/avg_num_step_conf": 5.890625,
"calib/ece": 0.19665322580645156,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5806451612903226,
"calib/gap": 0.37074898919599664,
"calib/mean_conf": 0.7041532258064516,
"calib/mu_c": 0.8641134751773051,
"calib/mu_w": 0.49336448598130844,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.16612903225806447,
"calib/std_conf": 0.38403745500202563,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4069795918367347,
"calib/step_q_c_n": 735.0,
"calib/step_q_gap": 0.09437933310452257,
"calib/step_q_w": 0.31260025873221214,
"calib/step_q_w_n": 773.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2898.0,
"completions/max_terminated_length": 2898.0,
"completions/mean_length": 587.85546875,
"completions/mean_terminated_length": 590.1608276367188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.10986666666666667,
"grad_norm": 0.0305598396807909,
"kl": 0.06987762451171875,
"learning_rate": 2.6944444444444444e-06,
"loss": 0.0375,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03288574516773224,
"mask/share_reasoning": 0.8510973453521729,
"mask/share_step_conf": 0.11211065948009491,
"num_tokens": 24826892.0,
"reward": 0.958093523979187,
"reward_std": 0.1841270923614502,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/asymmetric_l2_reward": 0.8656498193740845,
"rewards/final_brier_reward_step": 0.7466309070587158,
"rewards/format_reward_step": 0.96875,
"step": 103
},
{
"adv/mean_abs_final_conf": 0.6745511293411255,
"adv/mean_abs_reasoning": 0.437224805355072,
"adv/mean_abs_step_conf": 0.7339380383491516,
"adv/ratio_final_to_reasoning": 1.54280160018213,
"adv/ratio_step_to_reasoning": 1.678628543851984,
"adv/std_final_conf": 0.8667935729026794,
"adv/std_reasoning": 0.7206018567085266,
"adv/std_step_conf": 0.9330757260322571,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7567415025711777,
"calib/avg_num_step_conf": 6.078125,
"calib/ece": 0.18438735177865614,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.4505928853754941,
"calib/gap": 0.4076044149002886,
"calib/mean_conf": 0.5739525691699605,
"calib/mu_c": 0.7656716417910449,
"calib/mu_w": 0.35806722689075626,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.11434782608695651,
"calib/std_conf": 0.4134958809953477,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.407767253044655,
"calib/step_q_c_n": 739.0,
"calib/step_q_gap": 0.12217361779373703,
"calib/step_q_w": 0.28559363525091797,
"calib/step_q_w_n": 817.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2485.0,
"completions/max_terminated_length": 2485.0,
"completions/mean_length": 541.33203125,
"completions/mean_terminated_length": 541.33203125,
"completions/min_length": 187.0,
"completions/min_terminated_length": 187.0,
"epoch": 0.11093333333333333,
"grad_norm": 0.04134015738964081,
"kl": 0.07474517822265625,
"learning_rate": 2.666666666666667e-06,
"loss": -0.0286,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.032088808715343475,
"mask/share_reasoning": 0.8443004488945007,
"mask/share_step_conf": 0.12361074984073639,
"num_tokens": 25072153.0,
"reward": 0.9815112948417664,
"reward_std": 0.16471192240715027,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/asymmetric_l2_reward": 0.8935236930847168,
"rewards/final_brier_reward_step": 0.7687175869941711,
"rewards/format_reward_step": 0.98046875,
"step": 104
},
{
"adv/mean_abs_final_conf": 0.7218550443649292,
"adv/mean_abs_reasoning": 0.6229248642921448,
"adv/mean_abs_step_conf": 0.7113521099090576,
"adv/ratio_final_to_reasoning": 1.1588155903604889,
"adv/ratio_step_to_reasoning": 1.1419549141249905,
"adv/std_final_conf": 0.9065027236938477,
"adv/std_reasoning": 0.8430431485176086,
"adv/std_step_conf": 0.9336177110671997,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7497893577030267,
"calib/avg_num_step_conf": 5.77734375,
"calib/ece": 0.19215999999999994,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.496,
"calib/gap": 0.3931103765636139,
"calib/mean_conf": 0.6092000000000001,
"calib/mu_c": 0.7837410071942446,
"calib/mu_w": 0.39063063063063064,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.12267999999999994,
"calib/std_conf": 0.41203417334002773,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.3973221216041397,
"calib/step_q_c_n": 773.0,
"calib/step_q_gap": 0.08317197996108017,
"calib/step_q_w": 0.31415014164305954,
"calib/step_q_w_n": 706.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2826.0,
"completions/max_terminated_length": 2826.0,
"completions/mean_length": 542.796875,
"completions/mean_terminated_length": 542.796875,
"completions/min_length": 129.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.112,
"grad_norm": 0.05769439414143562,
"kl": 0.07154083251953125,
"learning_rate": 2.6388888888888893e-06,
"loss": 0.0324,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03391508758068085,
"mask/share_reasoning": 0.8441091179847717,
"mask/share_step_conf": 0.12197580933570862,
"num_tokens": 25316869.0,
"reward": 0.9692540168762207,
"reward_std": 0.21710465848445892,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/asymmetric_l2_reward": 0.8789149522781372,
"rewards/final_brier_reward_step": 0.7564679384231567,
"rewards/format_reward_step": 0.97265625,
"step": 105
},
{
"adv/mean_abs_final_conf": 0.6788108348846436,
"adv/mean_abs_reasoning": 0.4150359034538269,
"adv/mean_abs_step_conf": 0.7503209114074707,
"adv/ratio_final_to_reasoning": 1.6355472604556531,
"adv/ratio_step_to_reasoning": 1.807845791565222,
"adv/std_final_conf": 0.8545103669166565,
"adv/std_reasoning": 0.7012789249420166,
"adv/std_step_conf": 0.9325715899467468,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7854966677245319,
"calib/avg_num_step_conf": 5.3671875,
"calib/ece": 0.18345238095238095,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5436507936507936,
"calib/gap": 0.39273310060298333,
"calib/mean_conf": 0.6619444444444444,
"calib/mu_c": 0.8411678832116789,
"calib/mu_w": 0.44843478260869557,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.15087301587301588,
"calib/std_conf": 0.39241896960675976,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.40268258426966297,
"calib/step_q_c_n": 712.0,
"calib/step_q_gap": 0.08248620964730641,
"calib/step_q_w": 0.32019637462235656,
"calib/step_q_w_n": 662.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2670.0,
"completions/max_terminated_length": 2670.0,
"completions/mean_length": 481.9765625,
"completions/mean_terminated_length": 481.9765625,
"completions/min_length": 152.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.11306666666666666,
"grad_norm": 0.04111357033252716,
"kl": 0.0830841064453125,
"learning_rate": 2.6111111111111113e-06,
"loss": -0.0094,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.035246968269348145,
"mask/share_reasoning": 0.8441534042358398,
"mask/share_step_conf": 0.1205996572971344,
"num_tokens": 25544839.0,
"reward": 0.9766442775726318,
"reward_std": 0.1666399985551834,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/asymmetric_l2_reward": 0.8827617168426514,
"rewards/final_brier_reward_step": 0.7666206955909729,
"rewards/format_reward_step": 0.984375,
"step": 106
},
{
"adv/mean_abs_final_conf": 0.6700491905212402,
"adv/mean_abs_reasoning": 0.46548759937286377,
"adv/mean_abs_step_conf": 0.7599701881408691,
"adv/ratio_final_to_reasoning": 1.4394565857908472,
"adv/ratio_step_to_reasoning": 1.6326325108654927,
"adv/std_final_conf": 0.8719359636306763,
"adv/std_reasoning": 0.7206145524978638,
"adv/std_step_conf": 0.9324932098388672,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6578612753512619,
"calib/avg_num_step_conf": 6.25390625,
"calib/ece": 0.2519291338582677,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6653543307086615,
"calib/gap": 0.2677951554453557,
"calib/mean_conf": 0.7520866141732283,
"calib/mu_c": 0.8648979591836735,
"calib/mu_w": 0.5971028037383178,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.21263779527559057,
"calib/std_conf": 0.3685606932704154,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.40989394285714287,
"calib/step_q_c_n": 875.0,
"calib/step_q_gap": 0.08921901172766628,
"calib/step_q_w": 0.3206749311294766,
"calib/step_q_w_n": 726.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2140.0,
"completions/max_terminated_length": 2140.0,
"completions/mean_length": 491.32421875,
"completions/mean_terminated_length": 491.32421875,
"completions/min_length": 160.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.11413333333333334,
"grad_norm": 0.03381947800517082,
"kl": 0.08031463623046875,
"learning_rate": 2.5833333333333337e-06,
"loss": 0.0376,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03411445766687393,
"mask/share_reasoning": 0.8311688899993896,
"mask/share_step_conf": 0.13471662998199463,
"num_tokens": 25775234.0,
"reward": 0.9557232856750488,
"reward_std": 0.1572231650352478,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/asymmetric_l2_reward": 0.8837653398513794,
"rewards/final_brier_reward_step": 0.7151812314987183,
"rewards/format_reward_step": 0.98828125,
"step": 107
},
{
"adv/mean_abs_final_conf": 0.5977965593338013,
"adv/mean_abs_reasoning": 0.40435507893562317,
"adv/mean_abs_step_conf": 0.7389947772026062,
"adv/ratio_final_to_reasoning": 1.4783950801542316,
"adv/ratio_step_to_reasoning": 1.8275887102688293,
"adv/std_final_conf": 0.8289056420326233,
"adv/std_reasoning": 0.6816290020942688,
"adv/std_step_conf": 0.9333050847053528,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7069702328323018,
"calib/avg_num_step_conf": 6.2578125,
"calib/ece": 0.19730158730158717,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.7063492063492064,
"calib/gap": 0.33805481874447396,
"calib/mean_conf": 0.7677777777777778,
"calib/mu_c": 0.8724137931034484,
"calib/mu_w": 0.5343589743589744,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.13730158730158717,
"calib/std_conf": 0.37361466629570667,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.42418063314711363,
"calib/step_q_c_n": 1074.0,
"calib/step_q_gap": 0.08563896648044694,
"calib/step_q_w": 0.3385416666666667,
"calib/step_q_w_n": 528.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2421.0,
"completions/max_terminated_length": 2421.0,
"completions/mean_length": 530.125,
"completions/mean_terminated_length": 532.2039794921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.1152,
"grad_norm": 0.053452033549547195,
"kl": 0.07540130615234375,
"learning_rate": 2.5555555555555557e-06,
"loss": -0.0121,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03260500729084015,
"mask/share_reasoning": 0.8317328691482544,
"mask/share_step_conf": 0.13175587356090546,
"num_tokens": 26014178.0,
"reward": 0.9928351044654846,
"reward_std": 0.16699038445949554,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/asymmetric_l2_reward": 0.8799116611480713,
"rewards/final_brier_reward_step": 0.7729461193084717,
"rewards/format_reward_step": 0.984375,
"step": 108
},
{
"adv/mean_abs_final_conf": 0.6117569804191589,
"adv/mean_abs_reasoning": 0.3977644145488739,
"adv/mean_abs_step_conf": 0.7265390157699585,
"adv/ratio_final_to_reasoning": 1.5379882112204168,
"adv/ratio_step_to_reasoning": 1.826556095004038,
"adv/std_final_conf": 0.8428977131843567,
"adv/std_reasoning": 0.701329231262207,
"adv/std_step_conf": 0.9332946538925171,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.8309944119888238,
"calib/avg_num_step_conf": 6.8203125,
"calib/ece": 0.15553784860557762,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.549800796812749,
"calib/gap": 0.5199441198882397,
"calib/mean_conf": 0.6278884462151394,
"calib/mu_c": 0.8909677419354839,
"calib/mu_w": 0.3710236220472441,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.14470119521912345,
"calib/std_conf": 0.4216018631150103,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4085305105853051,
"calib/step_q_c_n": 803.0,
"calib/step_q_gap": 0.10964397824172079,
"calib/step_q_w": 0.2988865323435843,
"calib/step_q_w_n": 943.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2384.0,
"completions/max_terminated_length": 2384.0,
"completions/mean_length": 556.96484375,
"completions/mean_terminated_length": 556.96484375,
"completions/min_length": 165.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.11626666666666667,
"grad_norm": 0.02547648921608925,
"kl": 0.07645416259765625,
"learning_rate": 2.5277777777777778e-06,
"loss": -0.0174,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.031178954988718033,
"mask/share_reasoning": 0.8346405029296875,
"mask/share_step_conf": 0.13418057560920715,
"num_tokens": 26261361.0,
"reward": 0.9844216108322144,
"reward_std": 0.1782597303390503,
"rewards/accuracy_reward_step": 0.484375,
"rewards/asymmetric_l2_reward": 0.8859130144119263,
"rewards/final_brier_reward_step": 0.791523814201355,
"rewards/format_reward_step": 0.97265625,
"step": 109
},
{
"adv/mean_abs_final_conf": 0.7054433226585388,
"adv/mean_abs_reasoning": 0.4843199551105499,
"adv/mean_abs_step_conf": 0.7825179100036621,
"adv/ratio_final_to_reasoning": 1.4565646432997288,
"adv/ratio_step_to_reasoning": 1.615704456829672,
"adv/std_final_conf": 0.8620911240577698,
"adv/std_reasoning": 0.7207316160202026,
"adv/std_step_conf": 0.9318588376045227,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6559316569954868,
"calib/avg_num_step_conf": 5.3671875,
"calib/ece": 0.27661354581673303,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.5896414342629482,
"calib/gap": 0.260464861379755,
"calib/mean_conf": 0.6825896414342629,
"calib/mu_c": 0.7967375886524822,
"calib/mu_w": 0.5362727272727272,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.1987250996015936,
"calib/std_conf": 0.4021932734586809,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.44165925925925925,
"calib/step_q_c_n": 675.0,
"calib/step_q_gap": 0.09314709903036078,
"calib/step_q_w": 0.34851216022889847,
"calib/step_q_w_n": 699.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2391.0,
"completions/max_terminated_length": 2391.0,
"completions/mean_length": 491.0390625,
"completions/mean_terminated_length": 491.0390625,
"completions/min_length": 99.0,
"completions/min_terminated_length": 99.0,
"epoch": 0.11733333333333333,
"grad_norm": 0.04176861792802811,
"kl": 0.1174468994140625,
"learning_rate": 2.5e-06,
"loss": 0.0367,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03552088141441345,
"mask/share_reasoning": 0.8433677554130554,
"mask/share_step_conf": 0.12111136317253113,
"num_tokens": 26491987.0,
"reward": 0.929660439491272,
"reward_std": 0.18326841294765472,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/asymmetric_l2_reward": 0.8658431172370911,
"rewards/final_brier_reward_step": 0.6880090236663818,
"rewards/format_reward_step": 0.9765625,
"step": 110
},
{
"adv/mean_abs_final_conf": 0.614783763885498,
"adv/mean_abs_reasoning": 0.5257919430732727,
"adv/mean_abs_step_conf": 0.7170236110687256,
"adv/ratio_final_to_reasoning": 1.169252918354102,
"adv/ratio_step_to_reasoning": 1.3637021649242036,
"adv/std_final_conf": 0.8417708277702332,
"adv/std_reasoning": 0.7755916118621826,
"adv/std_step_conf": 0.933610200881958,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6853343013662464,
"calib/avg_num_step_conf": 5.546875,
"calib/ece": 0.2681854838709678,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.6370967741935484,
"calib/gap": 0.30863111345785776,
"calib/mean_conf": 0.7039919354838711,
"calib/mu_c": 0.8396402877697843,
"calib/mu_w": 0.5310091743119265,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.20584677419354847,
"calib/std_conf": 0.40234468043015986,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4302653631284916,
"calib/step_q_c_n": 716.0,
"calib/step_q_gap": 0.07023695403758251,
"calib/step_q_w": 0.3600284090909091,
"calib/step_q_w_n": 704.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2537.0,
"completions/max_terminated_length": 2537.0,
"completions/mean_length": 529.79296875,
"completions/mean_terminated_length": 531.87060546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 48.0,
"epoch": 0.1184,
"grad_norm": 0.03763270750641823,
"kl": 0.0719451904296875,
"learning_rate": 2.4722222222222226e-06,
"loss": -0.0694,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03530872240662575,
"mask/share_reasoning": 0.8400354385375977,
"mask/share_step_conf": 0.12074960768222809,
"num_tokens": 26735022.0,
"reward": 0.9260072708129883,
"reward_std": 0.21023571491241455,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/asymmetric_l2_reward": 0.8536633253097534,
"rewards/final_brier_reward_step": 0.6967886686325073,
"rewards/format_reward_step": 0.96484375,
"step": 111
},
{
"adv/mean_abs_final_conf": 0.6220129728317261,
"adv/mean_abs_reasoning": 0.4993098974227905,
"adv/mean_abs_step_conf": 0.7455708384513855,
"adv/ratio_final_to_reasoning": 1.2457453297887198,
"adv/ratio_step_to_reasoning": 1.4932026028317913,
"adv/std_final_conf": 0.8272049427032471,
"adv/std_reasoning": 0.7575883865356445,
"adv/std_step_conf": 0.9338021874427795,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.8360327743902439,
"calib/avg_num_step_conf": 5.56640625,
"calib/ece": 0.16860557768924303,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.5059760956175299,
"calib/gap": 0.5284616361788618,
"calib/mean_conf": 0.5791235059760956,
"calib/mu_c": 0.8486178861788618,
"calib/mu_w": 0.32015625000000003,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1288446215139442,
"calib/std_conf": 0.4347972333728267,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.45525581395348835,
"calib/step_q_c_n": 645.0,
"calib/step_q_gap": 0.12670453190220632,
"calib/step_q_w": 0.32855128205128203,
"calib/step_q_w_n": 780.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2709.0,
"completions/max_terminated_length": 2709.0,
"completions/mean_length": 565.5390625,
"completions/mean_terminated_length": 567.7568969726562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 193.0,
"epoch": 0.11946666666666667,
"grad_norm": 0.0486106239259243,
"kl": 0.07489013671875,
"learning_rate": 2.4444444444444447e-06,
"loss": -0.0159,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.030696198344230652,
"mask/share_reasoning": 0.8548277020454407,
"mask/share_step_conf": 0.11056986451148987,
"num_tokens": 26987720.0,
"reward": 0.973731517791748,
"reward_std": 0.18799945712089539,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/asymmetric_l2_reward": 0.8586279153823853,
"rewards/final_brier_reward_step": 0.7974288463592529,
"rewards/format_reward_step": 0.9765625,
"step": 112
},
{
"adv/mean_abs_final_conf": 0.6730247139930725,
"adv/mean_abs_reasoning": 0.5346618890762329,
"adv/mean_abs_step_conf": 0.730544924736023,
"adv/ratio_final_to_reasoning": 1.258785650789319,
"adv/ratio_step_to_reasoning": 1.3663680536464433,
"adv/std_final_conf": 0.8757805824279785,
"adv/std_reasoning": 0.8097303509712219,
"adv/std_step_conf": 0.933262050151825,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7831081081081082,
"calib/avg_num_step_conf": 6.359375,
"calib/ece": 0.1924557768924303,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6454183266932271,
"calib/gap": 0.4057848133848132,
"calib/mean_conf": 0.7248350597609562,
"calib/mu_c": 0.9042857142857142,
"calib/mu_w": 0.49850090090090104,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1797609561752988,
"calib/std_conf": 0.3822900440956289,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.40888641425389755,
"calib/step_q_c_n": 898.0,
"calib/step_q_gap": 0.06416038685663722,
"calib/step_q_w": 0.3447260273972603,
"calib/step_q_w_n": 730.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2549.0,
"completions/max_terminated_length": 2549.0,
"completions/mean_length": 490.23828125,
"completions/mean_terminated_length": 490.23828125,
"completions/min_length": 181.0,
"completions/min_terminated_length": 181.0,
"epoch": 0.12053333333333334,
"grad_norm": 0.041349541395902634,
"kl": 0.0941314697265625,
"learning_rate": 2.4166666666666667e-06,
"loss": 0.0675,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.035144634544849396,
"mask/share_reasoning": 0.8297086954116821,
"mask/share_step_conf": 0.1351466178894043,
"num_tokens": 27218421.0,
"reward": 0.982533872127533,
"reward_std": 0.20815324783325195,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/asymmetric_l2_reward": 0.894577145576477,
"rewards/final_brier_reward_step": 0.7642405033111572,
"rewards/format_reward_step": 0.98046875,
"step": 113
},
{
"adv/mean_abs_final_conf": 0.5990077257156372,
"adv/mean_abs_reasoning": 0.44762060046195984,
"adv/mean_abs_step_conf": 0.7410391569137573,
"adv/ratio_final_to_reasoning": 1.3382041065523809,
"adv/ratio_step_to_reasoning": 1.6555072669778368,
"adv/std_final_conf": 0.8304715156555176,
"adv/std_reasoning": 0.7391616702079773,
"adv/std_step_conf": 0.9329032897949219,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.8231817271087959,
"calib/avg_num_step_conf": 5.91015625,
"calib/ece": 0.19153543307086618,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.7401574803149606,
"calib/gap": 0.44428170707273107,
"calib/mean_conf": 0.7983858267716535,
"calib/mu_c": 0.9610559006211181,
"calib/mu_w": 0.516774193548387,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17803149606299218,
"calib/std_conf": 0.3566744815421641,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.44985092491838957,
"calib/step_q_c_n": 919.0,
"calib/step_q_gap": 0.11490142996889463,
"calib/step_q_w": 0.33494949494949494,
"calib/step_q_w_n": 594.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2880.0,
"completions/max_terminated_length": 2880.0,
"completions/mean_length": 496.80859375,
"completions/mean_terminated_length": 498.75689697265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.1216,
"grad_norm": 0.05756969749927521,
"kl": 0.081695556640625,
"learning_rate": 2.388888888888889e-06,
"loss": 0.0596,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.0352327898144722,
"mask/share_reasoning": 0.8273290395736694,
"mask/share_step_conf": 0.13353195786476135,
"num_tokens": 27450628.0,
"reward": 1.0205554962158203,
"reward_std": 0.18426382541656494,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/asymmetric_l2_reward": 0.903445839881897,
"rewards/final_brier_reward_step": 0.8134465217590332,
"rewards/format_reward_step": 0.9921875,
"step": 114
},
{
"adv/mean_abs_final_conf": 0.6113198399543762,
"adv/mean_abs_reasoning": 0.49032288789749146,
"adv/mean_abs_step_conf": 0.7666841745376587,
"adv/ratio_final_to_reasoning": 1.2467699449554979,
"adv/ratio_step_to_reasoning": 1.5636312182472385,
"adv/std_final_conf": 0.8144198060035706,
"adv/std_reasoning": 0.739387035369873,
"adv/std_step_conf": 0.9330512881278992,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6646076046600947,
"calib/avg_num_step_conf": 5.3671875,
"calib/ece": 0.3328063241106719,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.7747035573122529,
"calib/gap": 0.16428946357700702,
"calib/mean_conf": 0.8259288537549406,
"calib/mu_c": 0.8954109589041097,
"calib/mu_w": 0.7311214953271027,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.29083003952569164,
"calib/std_conf": 0.3312970480832067,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.45648578811369506,
"calib/step_q_c_n": 774.0,
"calib/step_q_gap": 0.07716912144702831,
"calib/step_q_w": 0.37931666666666675,
"calib/step_q_w_n": 600.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2616.0,
"completions/max_terminated_length": 2616.0,
"completions/mean_length": 466.1875,
"completions/mean_terminated_length": 466.1875,
"completions/min_length": 115.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.12266666666666666,
"grad_norm": 0.0432070791721344,
"kl": 0.07665252685546875,
"learning_rate": 2.361111111111111e-06,
"loss": 0.0476,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03583553060889244,
"mask/share_reasoning": 0.835993766784668,
"mask/share_step_conf": 0.12817072868347168,
"num_tokens": 27675236.0,
"reward": 0.9144834280014038,
"reward_std": 0.20204773545265198,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/asymmetric_l2_reward": 0.8649461269378662,
"rewards/final_brier_reward_step": 0.6530832052230835,
"rewards/format_reward_step": 0.984375,
"step": 115
},
{
"adv/mean_abs_final_conf": 0.6029642820358276,
"adv/mean_abs_reasoning": 0.39965057373046875,
"adv/mean_abs_step_conf": 0.7622563242912292,
"adv/ratio_final_to_reasoning": 1.508728678674379,
"adv/ratio_step_to_reasoning": 1.9073069686253175,
"adv/std_final_conf": 0.8000175952911377,
"adv/std_reasoning": 0.661288321018219,
"adv/std_step_conf": 0.9327738285064697,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7190315315315317,
"calib/avg_num_step_conf": 6.15625,
"calib/ece": 0.3134901960784313,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.803921568627451,
"calib/gap": 0.25926801801801813,
"calib/mean_conf": 0.8483921568627453,
"calib/mu_c": 0.9612500000000002,
"calib/mu_w": 0.701981981981982,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2985882352941176,
"calib/std_conf": 0.32429374716187354,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4388196176226101,
"calib/step_q_c_n": 802.0,
"calib/step_q_gap": 0.10645527653733883,
"calib/step_q_w": 0.3323643410852713,
"calib/step_q_w_n": 774.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2569.0,
"completions/max_terminated_length": 2569.0,
"completions/mean_length": 524.25,
"completions/mean_terminated_length": 524.25,
"completions/min_length": 99.0,
"completions/min_terminated_length": 99.0,
"epoch": 0.12373333333333333,
"grad_norm": 0.028108853846788406,
"kl": 0.0692291259765625,
"learning_rate": 2.3333333333333336e-06,
"loss": 0.0478,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.034591950476169586,
"mask/share_reasoning": 0.838742733001709,
"mask/share_step_conf": 0.1266653686761856,
"num_tokens": 27913964.0,
"reward": 0.9537807106971741,
"reward_std": 0.19260621070861816,
"rewards/accuracy_reward_step": 0.5625,
"rewards/asymmetric_l2_reward": 0.9025558829307556,
"rewards/final_brier_reward_step": 0.6932867169380188,
"rewards/format_reward_step": 0.99609375,
"step": 116
},
{
"adv/mean_abs_final_conf": 0.6685183048248291,
"adv/mean_abs_reasoning": 0.4874965250492096,
"adv/mean_abs_step_conf": 0.740862250328064,
"adv/ratio_final_to_reasoning": 1.37132937461933,
"adv/ratio_step_to_reasoning": 1.519728269351825,
"adv/std_final_conf": 0.8593764305114746,
"adv/std_reasoning": 0.7574522495269775,
"adv/std_step_conf": 0.9332804679870605,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6033924680983505,
"calib/avg_num_step_conf": 5.78515625,
"calib/ece": 0.44433070866141733,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.8622047244094488,
"calib/gap": 0.1423678804855275,
"calib/mean_conf": 0.8918110236220473,
"calib/mu_c": 0.9674789915966386,
"calib/mu_w": 0.8251111111111111,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.4338188976377953,
"calib/std_conf": 0.27420153896899985,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.44812593703148423,
"calib/step_q_c_n": 667.0,
"calib/step_q_gap": 0.06114805005359725,
"calib/step_q_w": 0.386977886977887,
"calib/step_q_w_n": 814.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1846.0,
"completions/max_terminated_length": 1846.0,
"completions/mean_length": 502.703125,
"completions/mean_terminated_length": 502.703125,
"completions/min_length": 199.0,
"completions/min_terminated_length": 199.0,
"epoch": 0.1248,
"grad_norm": 0.04808332771062851,
"kl": 0.07646942138671875,
"learning_rate": 2.305555555555556e-06,
"loss": -0.0376,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03324298560619354,
"mask/share_reasoning": 0.8397763967514038,
"mask/share_step_conf": 0.12698057293891907,
"num_tokens": 28149256.0,
"reward": 0.8545684814453125,
"reward_std": 0.2120169848203659,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/asymmetric_l2_reward": 0.8599258661270142,
"rewards/final_brier_reward_step": 0.5593671798706055,
"rewards/format_reward_step": 0.984375,
"step": 117
},
{
"adv/mean_abs_final_conf": 0.5423208475112915,
"adv/mean_abs_reasoning": 0.4119876027107239,
"adv/mean_abs_step_conf": 0.757232129573822,
"adv/ratio_final_to_reasoning": 1.3163523463886384,
"adv/ratio_step_to_reasoning": 1.8379973683468112,
"adv/std_final_conf": 0.7762859463691711,
"adv/std_reasoning": 0.7013146877288818,
"adv/std_step_conf": 0.9321770668029785,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6684493754982728,
"calib/avg_num_step_conf": 7.18359375,
"calib/ece": 0.3157258064516129,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.8266129032258065,
"calib/gap": 0.24094339622641503,
"calib/mean_conf": 0.8620161290322582,
"calib/mu_c": 0.965,
"calib/mu_w": 0.7240566037735849,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.30258064516129035,
"calib/std_conf": 0.3080400577184876,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4405574516496018,
"calib/step_q_c_n": 879.0,
"calib/step_q_gap": 0.10321370164960181,
"calib/step_q_w": 0.33734375,
"calib/step_q_w_n": 960.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2737.0,
"completions/max_terminated_length": 2737.0,
"completions/mean_length": 588.1640625,
"completions/mean_terminated_length": 590.4706420898438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.12586666666666665,
"grad_norm": 0.045879751443862915,
"kl": 0.06640625,
"learning_rate": 2.277777777777778e-06,
"loss": 0.0014,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03145633265376091,
"mask/share_reasoning": 0.8349190950393677,
"mask/share_step_conf": 0.1297183483839035,
"num_tokens": 28403834.0,
"reward": 0.9131325483322144,
"reward_std": 0.18790775537490845,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/asymmetric_l2_reward": 0.8495236039161682,
"rewards/final_brier_reward_step": 0.672835111618042,
"rewards/format_reward_step": 0.96484375,
"step": 118
},
{
"adv/mean_abs_final_conf": 0.579788863658905,
"adv/mean_abs_reasoning": 0.5056012868881226,
"adv/mean_abs_step_conf": 0.7397458553314209,
"adv/ratio_final_to_reasoning": 1.146731384382727,
"adv/ratio_step_to_reasoning": 1.4631012114000195,
"adv/std_final_conf": 0.797834575176239,
"adv/std_reasoning": 0.7394025921821594,
"adv/std_step_conf": 0.9339104890823364,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7308721765243504,
"calib/avg_num_step_conf": 6.29296875,
"calib/ece": 0.2750200803212853,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.7309236947791165,
"calib/gap": 0.310268311790051,
"calib/mean_conf": 0.7846586345381525,
"calib/mu_c": 0.9229710144927536,
"calib/mu_w": 0.6127027027027027,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.25273092369477923,
"calib/std_conf": 0.3692910606631113,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4636363636363636,
"calib/step_q_c_n": 770.0,
"calib/step_q_gap": 0.15116311750081074,
"calib/step_q_w": 0.3124732461355529,
"calib/step_q_w_n": 841.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2918.0,
"completions/max_terminated_length": 2918.0,
"completions/mean_length": 560.40625,
"completions/mean_terminated_length": 562.6039428710938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.12693333333333334,
"grad_norm": 0.0336654931306839,
"kl": 0.0772705078125,
"learning_rate": 2.25e-06,
"loss": -0.0541,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03116082400083542,
"mask/share_reasoning": 0.8479986190795898,
"mask/share_step_conf": 0.11693429946899414,
"num_tokens": 28652362.0,
"reward": 0.9408432841300964,
"reward_std": 0.20290254056453705,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/asymmetric_l2_reward": 0.8821717500686646,
"rewards/final_brier_reward_step": 0.6971710920333862,
"rewards/format_reward_step": 0.97265625,
"step": 119
},
{
"adv/mean_abs_final_conf": 0.6259399652481079,
"adv/mean_abs_reasoning": 0.38975387811660767,
"adv/mean_abs_step_conf": 0.7443583011627197,
"adv/ratio_final_to_reasoning": 1.6059877794489512,
"adv/ratio_step_to_reasoning": 1.9098162788261481,
"adv/std_final_conf": 0.81560218334198,
"adv/std_reasoning": 0.6612488627433777,
"adv/std_step_conf": 0.933025598526001,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7535215776667947,
"calib/avg_num_step_conf": 5.359375,
"calib/ece": 0.23131474103585664,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6932270916334662,
"calib/gap": 0.4136899731079523,
"calib/mean_conf": 0.7381673306772908,
"calib/mu_c": 0.9260583941605839,
"calib/mu_w": 0.5123684210526316,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.21183266932270922,
"calib/std_conf": 0.40169539594567233,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4514522821576764,
"calib/step_q_c_n": 723.0,
"calib/step_q_gap": 0.10832439309758396,
"calib/step_q_w": 0.34312788906009245,
"calib/step_q_w_n": 649.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2444.0,
"completions/max_terminated_length": 2444.0,
"completions/mean_length": 492.75390625,
"completions/mean_terminated_length": 494.6863098144531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.128,
"grad_norm": 0.03507012873888016,
"kl": 0.07172393798828125,
"learning_rate": 2.222222222222222e-06,
"loss": -0.0091,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03331432864069939,
"mask/share_reasoning": 0.845699667930603,
"mask/share_step_conf": 0.1170797273516655,
"num_tokens": 28885195.0,
"reward": 0.9636802673339844,
"reward_std": 0.18801307678222656,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/asymmetric_l2_reward": 0.8802074193954468,
"rewards/final_brier_reward_step": 0.7440280914306641,
"rewards/format_reward_step": 0.98046875,
"step": 120
},
{
"adv/mean_abs_final_conf": 0.693418025970459,
"adv/mean_abs_reasoning": 0.5543345212936401,
"adv/mean_abs_step_conf": 0.734613835811615,
"adv/ratio_final_to_reasoning": 1.250901755770581,
"adv/ratio_step_to_reasoning": 1.325217549318885,
"adv/std_final_conf": 0.8799854516983032,
"adv/std_reasoning": 0.7755146622657776,
"adv/std_step_conf": 0.9336560368537903,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6444260520786912,
"calib/avg_num_step_conf": 6.58984375,
"calib/ece": 0.33027777777777784,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.7738095238095238,
"calib/gap": 0.20913541732985297,
"calib/mean_conf": 0.8230555555555555,
"calib/mu_c": 0.9168345323741007,
"calib/mu_w": 0.7076991150442478,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.30087301587301596,
"calib/std_conf": 0.33569725603172723,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.410047281323877,
"calib/step_q_c_n": 846.0,
"calib/step_q_gap": 0.07631363090770577,
"calib/step_q_w": 0.33373365041617126,
"calib/step_q_w_n": 841.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2541.0,
"completions/max_terminated_length": 2541.0,
"completions/mean_length": 566.36328125,
"completions/mean_terminated_length": 568.5843505859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.12906666666666666,
"grad_norm": 0.05059061199426651,
"kl": 0.06748199462890625,
"learning_rate": 2.1944444444444445e-06,
"loss": 0.1015,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03129071742296219,
"mask/share_reasoning": 0.8439393043518066,
"mask/share_step_conf": 0.12086370587348938,
"num_tokens": 29135240.0,
"reward": 0.92536461353302,
"reward_std": 0.23206710815429688,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/asymmetric_l2_reward": 0.8859968185424805,
"rewards/final_brier_reward_step": 0.6592636704444885,
"rewards/format_reward_step": 0.984375,
"step": 121
},
{
"adv/mean_abs_final_conf": 0.637088418006897,
"adv/mean_abs_reasoning": 0.4608699381351471,
"adv/mean_abs_step_conf": 0.7540398836135864,
"adv/ratio_final_to_reasoning": 1.3823605431606063,
"adv/ratio_step_to_reasoning": 1.63612295187817,
"adv/std_final_conf": 0.842920184135437,
"adv/std_reasoning": 0.7205365896224976,
"adv/std_step_conf": 0.9331433176994324,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7871930050147871,
"calib/avg_num_step_conf": 5.69921875,
"calib/ece": 0.18192156862745107,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.6745098039215687,
"calib/gap": 0.4593088594573743,
"calib/mean_conf": 0.7199607843137256,
"calib/mu_c": 0.9018831168831168,
"calib/mu_w": 0.4425742574257425,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14898039215686285,
"calib/std_conf": 0.40719171958838457,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4631063321385902,
"calib/step_q_c_n": 837.0,
"calib/step_q_gap": 0.13792948326399213,
"calib/step_q_w": 0.3251768488745981,
"calib/step_q_w_n": 622.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3011.0,
"completions/max_terminated_length": 3011.0,
"completions/mean_length": 493.8046875,
"completions/mean_terminated_length": 493.8046875,
"completions/min_length": 110.0,
"completions/min_terminated_length": 110.0,
"epoch": 0.13013333333333332,
"grad_norm": 0.060518212616443634,
"kl": 0.072021484375,
"learning_rate": 2.166666666666667e-06,
"loss": -0.0451,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.034646786749362946,
"mask/share_reasoning": 0.8425935506820679,
"mask/share_step_conf": 0.12275967001914978,
"num_tokens": 29368998.0,
"reward": 1.011244773864746,
"reward_std": 0.18983519077301025,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/asymmetric_l2_reward": 0.9048250913619995,
"rewards/final_brier_reward_step": 0.7981331944465637,
"rewards/format_reward_step": 0.99609375,
"step": 122
},
{
"adv/mean_abs_final_conf": 0.7194132804870605,
"adv/mean_abs_reasoning": 0.5484261512756348,
"adv/mean_abs_step_conf": 0.7414064407348633,
"adv/ratio_final_to_reasoning": 1.3117778552567398,
"adv/ratio_step_to_reasoning": 1.3518801738581538,
"adv/std_final_conf": 0.8892462849617004,
"adv/std_reasoning": 0.792913556098938,
"adv/std_step_conf": 0.9334316253662109,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7523837706319458,
"calib/avg_num_step_conf": 6.34765625,
"calib/ece": 0.2340725806451614,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5645161290322581,
"calib/gap": 0.3882784244098114,
"calib/mean_conf": 0.6297177419354838,
"calib/mu_c": 0.8035036496350366,
"calib/mu_w": 0.4152252252252252,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.15568548387096784,
"calib/std_conf": 0.431571959022196,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.3978266331658291,
"calib/step_q_c_n": 796.0,
"calib/step_q_gap": 0.07244665729128147,
"calib/step_q_w": 0.32537997587454764,
"calib/step_q_w_n": 829.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2897.0,
"completions/max_terminated_length": 2897.0,
"completions/mean_length": 589.12109375,
"completions/mean_terminated_length": 591.431396484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.1312,
"grad_norm": 0.038368623703718185,
"kl": 0.0644683837890625,
"learning_rate": 2.138888888888889e-06,
"loss": 0.0179,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.02929634228348732,
"mask/share_reasoning": 0.8502581119537354,
"mask/share_step_conf": 0.1165393590927124,
"num_tokens": 29625101.0,
"reward": 0.9518745541572571,
"reward_std": 0.21453779935836792,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/asymmetric_l2_reward": 0.8731791973114014,
"rewards/final_brier_reward_step": 0.7290074229240417,
"rewards/format_reward_step": 0.96875,
"step": 123
},
{
"adv/mean_abs_final_conf": 0.6313794851303101,
"adv/mean_abs_reasoning": 0.46017879247665405,
"adv/mean_abs_step_conf": 0.7264816761016846,
"adv/ratio_final_to_reasoning": 1.3720308181354128,
"adv/ratio_step_to_reasoning": 1.578694385701272,
"adv/std_final_conf": 0.85479336977005,
"adv/std_reasoning": 0.7573562860488892,
"adv/std_step_conf": 0.9332188367843628,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7461487820934826,
"calib/avg_num_step_conf": 5.765625,
"calib/ece": 0.2508300395256916,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5810276679841897,
"calib/gap": 0.3679802501645819,
"calib/mean_conf": 0.6370750988142293,
"calib/mu_c": 0.7796129032258065,
"calib/mu_w": 0.41163265306122454,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.13762845849802363,
"calib/std_conf": 0.43541918185282724,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4402631578947368,
"calib/step_q_c_n": 874.0,
"calib/step_q_gap": 0.0714923937751355,
"calib/step_q_w": 0.3687707641196013,
"calib/step_q_w_n": 602.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2694.0,
"completions/max_terminated_length": 2694.0,
"completions/mean_length": 528.83203125,
"completions/mean_terminated_length": 528.83203125,
"completions/min_length": 121.0,
"completions/min_terminated_length": 121.0,
"epoch": 0.13226666666666667,
"grad_norm": 0.10215196013450623,
"kl": 0.08013153076171875,
"learning_rate": 2.1111111111111114e-06,
"loss": -0.0337,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03226089105010033,
"mask/share_reasoning": 0.8445743322372437,
"mask/share_step_conf": 0.12316481024026871,
"num_tokens": 29867298.0,
"reward": 0.9671303033828735,
"reward_std": 0.17288029193878174,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/asymmetric_l2_reward": 0.8771121501922607,
"rewards/final_brier_reward_step": 0.7383984327316284,
"rewards/format_reward_step": 0.98828125,
"step": 124
},
{
"adv/mean_abs_final_conf": 0.7052055597305298,
"adv/mean_abs_reasoning": 0.4439585208892822,
"adv/mean_abs_step_conf": 0.7640683650970459,
"adv/ratio_final_to_reasoning": 1.5884492053851116,
"adv/ratio_step_to_reasoning": 1.7210354777436407,
"adv/std_final_conf": 0.8930036425590515,
"adv/std_reasoning": 0.739206075668335,
"adv/std_step_conf": 0.932820737361908,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6647173489278753,
"calib/avg_num_step_conf": 5.98828125,
"calib/ece": 0.31192771084337334,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5943775100401606,
"calib/gap": 0.25868226120857696,
"calib/mean_conf": 0.635863453815261,
"calib/mu_c": 0.7542962962962962,
"calib/mu_w": 0.4956140350877193,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.20281124497991954,
"calib/std_conf": 0.4397463642706736,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.44329396325459314,
"calib/step_q_c_n": 762.0,
"calib/step_q_gap": 0.10667917726237525,
"calib/step_q_w": 0.3366147859922179,
"calib/step_q_w_n": 771.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2545.0,
"completions/max_terminated_length": 2545.0,
"completions/mean_length": 530.0,
"completions/mean_terminated_length": 534.1732177734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 173.0,
"epoch": 0.13333333333333333,
"grad_norm": 0.03873305022716522,
"kl": 0.06960296630859375,
"learning_rate": 2.0833333333333334e-06,
"loss": -0.0536,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03307991474866867,
"mask/share_reasoning": 0.8385022878646851,
"mask/share_step_conf": 0.12060528248548508,
"num_tokens": 30107786.0,
"reward": 0.8997880220413208,
"reward_std": 0.20577451586723328,
"rewards/accuracy_reward_step": 0.53125,
"rewards/asymmetric_l2_reward": 0.8392912149429321,
"rewards/final_brier_reward_step": 0.6595035195350647,
"rewards/format_reward_step": 0.97265625,
"step": 125
},
{
"adv/mean_abs_final_conf": 0.6224679350852966,
"adv/mean_abs_reasoning": 0.45161741971969604,
"adv/mean_abs_step_conf": 0.7560645341873169,
"adv/ratio_final_to_reasoning": 1.378308072066046,
"adv/ratio_step_to_reasoning": 1.67412615451499,
"adv/std_final_conf": 0.8212738633155823,
"adv/std_reasoning": 0.7205803990364075,
"adv/std_step_conf": 0.931928813457489,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.797762148337596,
"calib/avg_num_step_conf": 6.77734375,
"calib/ece": 0.20988047808764945,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5617529880478087,
"calib/gap": 0.43679859335038357,
"calib/mean_conf": 0.6259760956175299,
"calib/mu_c": 0.8261029411764705,
"calib/mu_w": 0.38930434782608697,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1470119521912351,
"calib/std_conf": 0.43423622491700775,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42935049019607846,
"calib/step_q_c_n": 816.0,
"calib/step_q_gap": 0.1333330799675692,
"calib/step_q_w": 0.29601741022850925,
"calib/step_q_w_n": 919.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2574.0,
"completions/max_terminated_length": 2574.0,
"completions/mean_length": 555.703125,
"completions/mean_terminated_length": 557.8823852539062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.1344,
"grad_norm": 0.051755405962467194,
"kl": 0.06482315063476562,
"learning_rate": 2.0555555555555555e-06,
"loss": -0.027,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03272949904203415,
"mask/share_reasoning": 0.8290582299232483,
"mask/share_step_conf": 0.13430599868297577,
"num_tokens": 30355510.0,
"reward": 0.9703141450881958,
"reward_std": 0.1568230241537094,
"rewards/accuracy_reward_step": 0.53125,
"rewards/asymmetric_l2_reward": 0.8804025650024414,
"rewards/final_brier_reward_step": 0.7578819990158081,
"rewards/format_reward_step": 0.98046875,
"step": 126
},
{
"adv/mean_abs_final_conf": 0.650374174118042,
"adv/mean_abs_reasoning": 0.46083492040634155,
"adv/mean_abs_step_conf": 0.7511869668960571,
"adv/ratio_final_to_reasoning": 1.4112953366133236,
"adv/ratio_step_to_reasoning": 1.630056520529515,
"adv/std_final_conf": 0.8628984093666077,
"adv/std_reasoning": 0.739283561706543,
"adv/std_step_conf": 0.9332513809204102,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.8018252933507171,
"calib/avg_num_step_conf": 6.26171875,
"calib/ece": 0.21907258064516133,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.5362903225806451,
"calib/gap": 0.46292959582790094,
"calib/mean_conf": 0.5784274193548387,
"calib/mu_c": 0.7986923076923077,
"calib/mu_w": 0.33576271186440676,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.13665322580645164,
"calib/std_conf": 0.4548098091224347,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4438493150684931,
"calib/step_q_c_n": 730.0,
"calib/step_q_gap": 0.14875194966184935,
"calib/step_q_w": 0.2950973654066438,
"calib/step_q_w_n": 873.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3006.0,
"completions/max_terminated_length": 3006.0,
"completions/mean_length": 549.046875,
"completions/mean_terminated_length": 549.046875,
"completions/min_length": 136.0,
"completions/min_terminated_length": 136.0,
"epoch": 0.13546666666666668,
"grad_norm": 0.035358961671590805,
"kl": 0.07138442993164062,
"learning_rate": 2.027777777777778e-06,
"loss": 0.1926,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03422444313764572,
"mask/share_reasoning": 0.831305980682373,
"mask/share_step_conf": 0.13446959853172302,
"num_tokens": 30599738.0,
"reward": 0.9445489645004272,
"reward_std": 0.20476898550987244,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/asymmetric_l2_reward": 0.860126793384552,
"rewards/final_brier_reward_step": 0.7360023260116577,
"rewards/format_reward_step": 0.95703125,
"step": 127
},
{
"adv/mean_abs_final_conf": 0.704669177532196,
"adv/mean_abs_reasoning": 0.5308483839035034,
"adv/mean_abs_step_conf": 0.761113166809082,
"adv/ratio_final_to_reasoning": 1.3274396209903305,
"adv/ratio_step_to_reasoning": 1.43376751232125,
"adv/std_final_conf": 0.85466468334198,
"adv/std_reasoning": 0.7577895522117615,
"adv/std_step_conf": 0.9339450001716614,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7072817820849318,
"calib/avg_num_step_conf": 5.4296875,
"calib/ece": 0.27745901639344256,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.5778688524590164,
"calib/gap": 0.3277353792314422,
"calib/mean_conf": 0.6243442622950819,
"calib/mu_c": 0.781496062992126,
"calib/mu_w": 0.45376068376068385,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.190655737704918,
"calib/std_conf": 0.44240121191419035,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.4389984101748808,
"calib/step_q_c_n": 629.0,
"calib/step_q_gap": 0.10517449427474934,
"calib/step_q_w": 0.33382391590013144,
"calib/step_q_w_n": 761.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 3005.0,
"completions/max_terminated_length": 3005.0,
"completions/mean_length": 533.0390625,
"completions/mean_terminated_length": 539.3596801757812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.13653333333333334,
"grad_norm": 0.026687778532505035,
"kl": 0.0691680908203125,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.0484,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.034037213772535324,
"mask/share_reasoning": 0.8381680846214294,
"mask/share_step_conf": 0.11607595533132553,
"num_tokens": 30842860.0,
"reward": 0.8984044790267944,
"reward_std": 0.2292974293231964,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/asymmetric_l2_reward": 0.8419756889343262,
"rewards/final_brier_reward_step": 0.666551947593689,
"rewards/format_reward_step": 0.9453125,
"step": 128
},
{
"adv/mean_abs_final_conf": 0.5893256664276123,
"adv/mean_abs_reasoning": 0.3948642611503601,
"adv/mean_abs_step_conf": 0.7625530958175659,
"adv/ratio_final_to_reasoning": 1.4924765910967146,
"adv/ratio_step_to_reasoning": 1.9311778016982748,
"adv/std_final_conf": 0.8301159143447876,
"adv/std_reasoning": 0.6815221309661865,
"adv/std_step_conf": 0.9312430024147034,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6886455219030286,
"calib/avg_num_step_conf": 6.2890625,
"calib/ece": 0.2615294117647058,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.6627450980392157,
"calib/gap": 0.2917977382035617,
"calib/mean_conf": 0.7284313725490196,
"calib/mu_c": 0.8405732484076434,
"calib/mu_w": 0.5487755102040817,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18713725490196073,
"calib/std_conf": 0.4012772380298454,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.43962121212121213,
"calib/step_q_c_n": 924.0,
"calib/step_q_gap": 0.10281363194628501,
"calib/step_q_w": 0.3368075801749271,
"calib/step_q_w_n": 686.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2646.0,
"completions/max_terminated_length": 2646.0,
"completions/mean_length": 483.7578125,
"completions/mean_terminated_length": 483.7578125,
"completions/min_length": 140.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.1376,
"grad_norm": 0.042509667575359344,
"kl": 0.07884979248046875,
"learning_rate": 1.9722222222222224e-06,
"loss": 0.0618,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.0361248143017292,
"mask/share_reasoning": 0.8258543610572815,
"mask/share_step_conf": 0.1380208432674408,
"num_tokens": 31069086.0,
"reward": 0.970730721950531,
"reward_std": 0.1526256501674652,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/asymmetric_l2_reward": 0.8946923017501831,
"rewards/final_brier_reward_step": 0.7248941659927368,
"rewards/format_reward_step": 0.99609375,
"step": 129
},
{
"adv/mean_abs_final_conf": 0.5929268002510071,
"adv/mean_abs_reasoning": 0.2917104959487915,
"adv/mean_abs_step_conf": 0.7566444873809814,
"adv/ratio_final_to_reasoning": 2.0325864461013183,
"adv/ratio_step_to_reasoning": 2.5938198929729532,
"adv/std_final_conf": 0.8083614706993103,
"adv/std_reasoning": 0.5727423429489136,
"adv/std_step_conf": 0.9330140948295593,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7233320147679325,
"calib/avg_num_step_conf": 5.33203125,
"calib/ece": 0.2249212598425196,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.6889763779527559,
"calib/gap": 0.385006592827004,
"calib/mean_conf": 0.7233464566929134,
"calib/mu_c": 0.8688607594936707,
"calib/mu_w": 0.4838541666666667,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.16311023622047235,
"calib/std_conf": 0.41253229657299734,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.48749683944374206,
"calib/step_q_c_n": 791.0,
"calib/step_q_gap": 0.103942832475101,
"calib/step_q_w": 0.38355400696864106,
"calib/step_q_w_n": 574.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2842.0,
"completions/max_terminated_length": 2842.0,
"completions/mean_length": 471.3984375,
"completions/mean_terminated_length": 471.3984375,
"completions/min_length": 94.0,
"completions/min_terminated_length": 94.0,
"epoch": 0.13866666666666666,
"grad_norm": 0.06835480779409409,
"kl": 0.07553863525390625,
"learning_rate": 1.944444444444445e-06,
"loss": -0.0023,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03732430934906006,
"mask/share_reasoning": 0.8383286595344543,
"mask/share_step_conf": 0.1243470311164856,
"num_tokens": 31295052.0,
"reward": 0.9831303358078003,
"reward_std": 0.14991185069084167,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/asymmetric_l2_reward": 0.8848813772201538,
"rewards/final_brier_reward_step": 0.7595043182373047,
"rewards/format_reward_step": 0.9921875,
"step": 130
},
{
"adv/mean_abs_final_conf": 0.6311858892440796,
"adv/mean_abs_reasoning": 0.2988106906414032,
"adv/mean_abs_step_conf": 0.7364592552185059,
"adv/ratio_final_to_reasoning": 2.1123269983722013,
"adv/ratio_step_to_reasoning": 2.4646348952163697,
"adv/std_final_conf": 0.8326376676559448,
"adv/std_reasoning": 0.6184049248695374,
"adv/std_step_conf": 0.9334642291069031,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7967821142414061,
"calib/avg_num_step_conf": 5.234375,
"calib/ece": 0.2596825396825397,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5992063492063492,
"calib/gap": 0.44134401654174205,
"calib/mean_conf": 0.6542063492063492,
"calib/mu_c": 0.9099056603773585,
"calib/mu_w": 0.4685616438356165,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.24662698412698414,
"calib/std_conf": 0.43570204828566333,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4780145719489981,
"calib/step_q_c_n": 549.0,
"calib/step_q_gap": 0.13446210671511694,
"calib/step_q_w": 0.34355246523388117,
"calib/step_q_w_n": 791.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2566.0,
"completions/max_terminated_length": 2566.0,
"completions/mean_length": 479.078125,
"completions/mean_terminated_length": 479.078125,
"completions/min_length": 169.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.13973333333333332,
"grad_norm": 0.06330462545156479,
"kl": 0.06920623779296875,
"learning_rate": 1.916666666666667e-06,
"loss": -0.0458,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03542075306177139,
"mask/share_reasoning": 0.8427149057388306,
"mask/share_step_conf": 0.12186426669359207,
"num_tokens": 31523904.0,
"reward": 0.9323045015335083,
"reward_std": 0.1720176488161087,
"rewards/accuracy_reward_step": 0.4140625,
"rewards/asymmetric_l2_reward": 0.8692620396614075,
"rewards/final_brier_reward_step": 0.7156593799591064,
"rewards/format_reward_step": 0.984375,
"step": 131
},
{
"adv/mean_abs_final_conf": 0.5419154167175293,
"adv/mean_abs_reasoning": 0.39009517431259155,
"adv/mean_abs_step_conf": 0.7669232487678528,
"adv/ratio_final_to_reasoning": 1.3891876967523853,
"adv/ratio_step_to_reasoning": 1.9659900949025861,
"adv/std_final_conf": 0.7634062170982361,
"adv/std_reasoning": 0.6612535119056702,
"adv/std_step_conf": 0.932680070400238,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7523074894514769,
"calib/avg_num_step_conf": 5.609375,
"calib/ece": 0.24881889763779533,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.7007874015748031,
"calib/gap": 0.348175105485232,
"calib/mean_conf": 0.7357480314960629,
"calib/mu_c": 0.8673417721518987,
"calib/mu_w": 0.5191666666666667,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1812598425196851,
"calib/std_conf": 0.40682372449389625,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4939951573849879,
"calib/step_q_c_n": 826.0,
"calib/step_q_gap": 0.158306632794824,
"calib/step_q_w": 0.3356885245901639,
"calib/step_q_w_n": 610.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2357.0,
"completions/max_terminated_length": 2357.0,
"completions/mean_length": 509.984375,
"completions/mean_terminated_length": 509.984375,
"completions/min_length": 140.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.1408,
"grad_norm": 0.027949035167694092,
"kl": 0.0719757080078125,
"learning_rate": 1.888888888888889e-06,
"loss": 0.0022,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03615806996822357,
"mask/share_reasoning": 0.8362313508987427,
"mask/share_step_conf": 0.12761051952838898,
"num_tokens": 31760052.0,
"reward": 0.9794524908065796,
"reward_std": 0.16386666893959045,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/asymmetric_l2_reward": 0.8927135467529297,
"rewards/final_brier_reward_step": 0.7443163394927979,
"rewards/format_reward_step": 0.9921875,
"step": 132
},
{
"adv/mean_abs_final_conf": 0.7018519043922424,
"adv/mean_abs_reasoning": 0.4902651607990265,
"adv/mean_abs_step_conf": 0.7454802989959717,
"adv/ratio_final_to_reasoning": 1.4315761357555474,
"adv/ratio_step_to_reasoning": 1.5205655196485908,
"adv/std_final_conf": 0.8919135332107544,
"adv/std_reasoning": 0.7752693295478821,
"adv/std_step_conf": 0.9336416721343994,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7900599270453361,
"calib/avg_num_step_conf": 6.7890625,
"calib/ece": 0.266600790513834,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5296442687747036,
"calib/gap": 0.39868290776446064,
"calib/mean_conf": 0.5900790513833992,
"calib/mu_c": 0.8296039603960396,
"calib/mu_w": 0.4309210526315789,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.22873517786561268,
"calib/std_conf": 0.44594793302387903,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.40628205128205136,
"calib/step_q_c_n": 702.0,
"calib/step_q_gap": 0.07311602811602819,
"calib/step_q_w": 0.3331660231660232,
"calib/step_q_w_n": 1036.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2896.0,
"completions/max_terminated_length": 2896.0,
"completions/mean_length": 595.953125,
"completions/mean_terminated_length": 598.2902221679688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.14186666666666667,
"grad_norm": 0.05951628088951111,
"kl": 0.0603179931640625,
"learning_rate": 1.8611111111111113e-06,
"loss": -0.0138,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.028608456254005432,
"mask/share_reasoning": 0.8479064702987671,
"mask/share_step_conf": 0.11957882344722748,
"num_tokens": 32018960.0,
"reward": 0.9371469020843506,
"reward_std": 0.21386194229125977,
"rewards/accuracy_reward_step": 0.39453125,
"rewards/asymmetric_l2_reward": 0.8900238871574402,
"rewards/final_brier_reward_step": 0.707707405090332,
"rewards/format_reward_step": 0.98828125,
"step": 133
},
{
"adv/mean_abs_final_conf": 0.7257549166679382,
"adv/mean_abs_reasoning": 0.49450692534446716,
"adv/mean_abs_step_conf": 0.752888560295105,
"adv/ratio_final_to_reasoning": 1.4676334738131052,
"adv/ratio_step_to_reasoning": 1.522503572160597,
"adv/std_final_conf": 0.8783618807792664,
"adv/std_reasoning": 0.7394267320632935,
"adv/std_step_conf": 0.9331688284873962,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6966345244847797,
"calib/avg_num_step_conf": 5.9453125,
"calib/ece": 0.30107142857142855,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6111111111111112,
"calib/gap": 0.30402155416903004,
"calib/mean_conf": 0.6541666666666667,
"calib/mu_c": 0.8025581395348838,
"calib/mu_w": 0.4985365853658537,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.22166666666666668,
"calib/std_conf": 0.4401944240359945,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.42251948051948046,
"calib/step_q_c_n": 770.0,
"calib/step_q_gap": 0.09624288477479959,
"calib/step_q_w": 0.3262765957446809,
"calib/step_q_w_n": 752.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2497.0,
"completions/max_terminated_length": 2497.0,
"completions/mean_length": 585.1484375,
"completions/mean_terminated_length": 587.4431762695312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 176.0,
"epoch": 0.14293333333333333,
"grad_norm": 0.02902313508093357,
"kl": 0.0628509521484375,
"learning_rate": 1.8333333333333333e-06,
"loss": -0.0718,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.030042003840208054,
"mask/share_reasoning": 0.8552829027175903,
"mask/share_step_conf": 0.11076889932155609,
"num_tokens": 32277710.0,
"reward": 0.925879716873169,
"reward_std": 0.22603853046894073,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/asymmetric_l2_reward": 0.8767973184585571,
"rewards/final_brier_reward_step": 0.6773058176040649,
"rewards/format_reward_step": 0.984375,
"step": 134
},
{
"adv/mean_abs_final_conf": 0.6498154997825623,
"adv/mean_abs_reasoning": 0.3868643045425415,
"adv/mean_abs_step_conf": 0.7391627430915833,
"adv/ratio_final_to_reasoning": 1.6796987784927708,
"adv/ratio_step_to_reasoning": 1.9106511880583732,
"adv/std_final_conf": 0.8298484683036804,
"adv/std_reasoning": 0.6614306569099426,
"adv/std_step_conf": 0.9338327646255493,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6963808760683761,
"calib/avg_num_step_conf": 6.21484375,
"calib/ece": 0.2699592741935484,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.625,
"calib/gap": 0.33560432692307685,
"calib/mean_conf": 0.6729439516129033,
"calib/mu_c": 0.8136812499999999,
"calib/mu_w": 0.47807692307692307,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.18112903225806457,
"calib/std_conf": 0.4334902361093346,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4100843373493976,
"calib/step_q_c_n": 830.0,
"calib/step_q_gap": 0.08741679464243307,
"calib/step_q_w": 0.3226675427069645,
"calib/step_q_w_n": 761.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2746.0,
"completions/max_terminated_length": 2746.0,
"completions/mean_length": 573.79296875,
"completions/mean_terminated_length": 573.79296875,
"completions/min_length": 171.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.144,
"grad_norm": 0.03258584067225456,
"kl": 0.058887481689453125,
"learning_rate": 1.8055555555555557e-06,
"loss": 0.0267,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03373143821954727,
"mask/share_reasoning": 0.8400686383247375,
"mask/share_step_conf": 0.12619991600513458,
"num_tokens": 32530481.0,
"reward": 0.9410494565963745,
"reward_std": 0.21206125617027283,
"rewards/accuracy_reward_step": 0.5625,
"rewards/asymmetric_l2_reward": 0.8774411678314209,
"rewards/final_brier_reward_step": 0.6991890668869019,
"rewards/format_reward_step": 0.96484375,
"step": 135
},
{
"adv/mean_abs_final_conf": 0.6550817489624023,
"adv/mean_abs_reasoning": 0.5279697775840759,
"adv/mean_abs_step_conf": 0.7136498093605042,
"adv/ratio_final_to_reasoning": 1.240756150778128,
"adv/ratio_step_to_reasoning": 1.3516868572024652,
"adv/std_final_conf": 0.8621448874473572,
"adv/std_reasoning": 0.7927603721618652,
"adv/std_step_conf": 0.9325664043426514,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7687752016129032,
"calib/avg_num_step_conf": 6.453125,
"calib/ece": 0.25138888888888883,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5317460317460317,
"calib/gap": 0.3933266129032257,
"calib/mean_conf": 0.5791666666666666,
"calib/mu_c": 0.7789516129032257,
"calib/mu_w": 0.385625,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.16924603174603167,
"calib/std_conf": 0.4533461351483745,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.42105726872246696,
"calib/step_q_c_n": 681.0,
"calib/step_q_gap": 0.16384820590063381,
"calib/step_q_w": 0.25720906282183315,
"calib/step_q_w_n": 971.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2577.0,
"completions/max_terminated_length": 2577.0,
"completions/mean_length": 528.12890625,
"completions/mean_terminated_length": 530.2000122070312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.14506666666666668,
"grad_norm": 0.04807325452566147,
"kl": 0.07080078125,
"learning_rate": 1.777777777777778e-06,
"loss": 0.0226,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.032773517072200775,
"mask/share_reasoning": 0.830197274684906,
"mask/share_step_conf": 0.13312296569347382,
"num_tokens": 32774170.0,
"reward": 0.9568853974342346,
"reward_std": 0.19861853122711182,
"rewards/accuracy_reward_step": 0.484375,
"rewards/asymmetric_l2_reward": 0.9026017189025879,
"rewards/final_brier_reward_step": 0.7182003855705261,
"rewards/format_reward_step": 0.98046875,
"step": 136
},
{
"adv/mean_abs_final_conf": 0.6266046762466431,
"adv/mean_abs_reasoning": 0.4091912508010864,
"adv/mean_abs_step_conf": 0.7550874352455139,
"adv/ratio_final_to_reasoning": 1.5313247167917683,
"adv/ratio_step_to_reasoning": 1.8453166673707118,
"adv/std_final_conf": 0.8365187644958496,
"adv/std_reasoning": 0.6816372275352478,
"adv/std_step_conf": 0.9328610301017761,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7627720120522263,
"calib/avg_num_step_conf": 6.84375,
"calib/ece": 0.22254032258064518,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.6008064516129032,
"calib/gap": 0.42354670237696684,
"calib/mean_conf": 0.6422983870967742,
"calib/mu_c": 0.8182068965517242,
"calib/mu_w": 0.39466019417475734,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14008064516129035,
"calib/std_conf": 0.44154781788756414,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.40336353340883346,
"calib/step_q_c_n": 883.0,
"calib/step_q_gap": 0.13861094422586456,
"calib/step_q_w": 0.2647525891829689,
"calib/step_q_w_n": 869.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 3003.0,
"completions/max_terminated_length": 3003.0,
"completions/mean_length": 539.8125,
"completions/mean_terminated_length": 546.2134399414062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.14613333333333334,
"grad_norm": 0.04263077676296234,
"kl": 0.0669097900390625,
"learning_rate": 1.75e-06,
"loss": -0.0123,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03168744221329689,
"mask/share_reasoning": 0.8275086879730225,
"mask/share_step_conf": 0.12908512353897095,
"num_tokens": 33019346.0,
"reward": 0.9610379934310913,
"reward_std": 0.19014747440814972,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/asymmetric_l2_reward": 0.8790087103843689,
"rewards/final_brier_reward_step": 0.7368171215057373,
"rewards/format_reward_step": 0.96484375,
"step": 137
},
{
"adv/mean_abs_final_conf": 0.6476581692695618,
"adv/mean_abs_reasoning": 0.5334905385971069,
"adv/mean_abs_step_conf": 0.7643733620643616,
"adv/ratio_final_to_reasoning": 1.2140012285366404,
"adv/ratio_step_to_reasoning": 1.4327777284942962,
"adv/std_final_conf": 0.8558839559555054,
"adv/std_reasoning": 0.7754238247871399,
"adv/std_step_conf": 0.9321677684783936,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7955010224948875,
"calib/avg_num_step_conf": 5.7734375,
"calib/ece": 0.18932806324110668,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5968379446640316,
"calib/gap": 0.4857866394001363,
"calib/mean_conf": 0.6537549407114623,
"calib/mu_c": 0.8265644171779141,
"calib/mu_w": 0.3407777777777778,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.09940711462450587,
"calib/std_conf": 0.42963415269661276,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4080204778156996,
"calib/step_q_c_n": 879.0,
"calib/step_q_gap": 0.1297233158791387,
"calib/step_q_w": 0.2782971619365609,
"calib/step_q_w_n": 599.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2661.0,
"completions/max_terminated_length": 2661.0,
"completions/mean_length": 496.4921875,
"completions/mean_terminated_length": 498.4392395019531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 136.0,
"epoch": 0.1472,
"grad_norm": 0.04252735897898674,
"kl": 0.07735443115234375,
"learning_rate": 1.7222222222222224e-06,
"loss": -0.0624,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03474820777773857,
"mask/share_reasoning": 0.8353586792945862,
"mask/share_step_conf": 0.12598684430122375,
"num_tokens": 33250784.0,
"reward": 1.007904052734375,
"reward_std": 0.1891259253025055,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/asymmetric_l2_reward": 0.8937262892723083,
"rewards/final_brier_reward_step": 0.7978628873825073,
"rewards/format_reward_step": 0.984375,
"step": 138
},
{
"adv/mean_abs_final_conf": 0.6581767797470093,
"adv/mean_abs_reasoning": 0.42416638135910034,
"adv/mean_abs_step_conf": 0.7499033212661743,
"adv/ratio_final_to_reasoning": 1.5516948270112787,
"adv/ratio_step_to_reasoning": 1.7679461509027616,
"adv/std_final_conf": 0.8394871950149536,
"adv/std_reasoning": 0.7013220191001892,
"adv/std_step_conf": 0.9318543672561646,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7805037191700379,
"calib/avg_num_step_conf": 5.53515625,
"calib/ece": 0.19721568627450975,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5450980392156862,
"calib/gap": 0.4600561137935535,
"calib/mean_conf": 0.6047450980392157,
"calib/mu_c": 0.779746835443038,
"calib/mu_w": 0.31969072164948453,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09117647058823526,
"calib/std_conf": 0.4437459016386166,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4019053398058252,
"calib/step_q_c_n": 824.0,
"calib/step_q_gap": 0.10600314756299212,
"calib/step_q_w": 0.2959021922428331,
"calib/step_q_w_n": 593.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2967.0,
"completions/max_terminated_length": 2967.0,
"completions/mean_length": 459.67578125,
"completions/mean_terminated_length": 459.67578125,
"completions/min_length": 151.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.14826666666666666,
"grad_norm": 0.07648757100105286,
"kl": 0.086883544921875,
"learning_rate": 1.6944444444444446e-06,
"loss": 0.0081,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03702244907617569,
"mask/share_reasoning": 0.8356660604476929,
"mask/share_step_conf": 0.12731149792671204,
"num_tokens": 33471557.0,
"reward": 1.0029046535491943,
"reward_std": 0.16366134583950043,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/asymmetric_l2_reward": 0.9031118750572205,
"rewards/final_brier_reward_step": 0.7808222770690918,
"rewards/format_reward_step": 0.9921875,
"step": 139
},
{
"adv/mean_abs_final_conf": 0.5114564895629883,
"adv/mean_abs_reasoning": 0.33423930406570435,
"adv/mean_abs_step_conf": 0.7621335983276367,
"adv/ratio_final_to_reasoning": 1.5302104909315117,
"adv/ratio_step_to_reasoning": 2.280203402343781,
"adv/std_final_conf": 0.7727295160293579,
"adv/std_reasoning": 0.6610760688781738,
"adv/std_step_conf": 0.9312514066696167,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.8144219396806622,
"calib/avg_num_step_conf": 5.375,
"calib/ece": 0.1538976377952756,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.6850393700787402,
"calib/gap": 0.5125340035481964,
"calib/mean_conf": 0.7275984251968504,
"calib/mu_c": 0.8809550561797753,
"calib/mu_w": 0.3684210526315789,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0903543307086614,
"calib/std_conf": 0.4108740205767719,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.44658747300215984,
"calib/step_q_c_n": 926.0,
"calib/step_q_gap": 0.15103191744660427,
"calib/step_q_w": 0.29555555555555557,
"calib/step_q_w_n": 450.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2630.0,
"completions/max_terminated_length": 2630.0,
"completions/mean_length": 500.43359375,
"completions/mean_terminated_length": 500.43359375,
"completions/min_length": 133.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.14933333333333335,
"grad_norm": 0.041224293410778046,
"kl": 0.0767364501953125,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0673,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03436052054166794,
"mask/share_reasoning": 0.8470271825790405,
"mask/share_step_conf": 0.11861232668161392,
"num_tokens": 33704684.0,
"reward": 1.0392565727233887,
"reward_std": 0.12018037587404251,
"rewards/accuracy_reward_step": 0.6953125,
"rewards/asymmetric_l2_reward": 0.9118223190307617,
"rewards/final_brier_reward_step": 0.8291909694671631,
"rewards/format_reward_step": 0.9921875,
"step": 140
},
{
"adv/mean_abs_final_conf": 0.49427223205566406,
"adv/mean_abs_reasoning": 0.4047102928161621,
"adv/mean_abs_step_conf": 0.7349098920822144,
"adv/ratio_final_to_reasoning": 1.2212988916498475,
"adv/ratio_step_to_reasoning": 1.8158912810652037,
"adv/std_final_conf": 0.7135685682296753,
"adv/std_reasoning": 0.6816495060920715,
"adv/std_step_conf": 0.9325038194656372,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.881882576310793,
"calib/avg_num_step_conf": 6.0390625,
"calib/ece": 0.10734939759036147,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.642570281124498,
"calib/gap": 0.6617306849715201,
"calib/mean_conf": 0.6774698795180722,
"calib/mu_c": 0.895389221556886,
"calib/mu_w": 0.2336585365853659,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.057068273092369504,
"calib/std_conf": 0.43568395210389493,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.39104166666666673,
"calib/step_q_c_n": 1008.0,
"calib/step_q_gap": 0.12533534696406445,
"calib/step_q_w": 0.2657063197026023,
"calib/step_q_w_n": 538.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2210.0,
"completions/max_terminated_length": 2210.0,
"completions/mean_length": 538.55078125,
"completions/mean_terminated_length": 540.6627807617188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 123.0,
"epoch": 0.1504,
"grad_norm": 0.03272243216633797,
"kl": 0.063079833984375,
"learning_rate": 1.638888888888889e-06,
"loss": 0.0454,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.032417528331279755,
"mask/share_reasoning": 0.8407843112945557,
"mask/share_step_conf": 0.12289191037416458,
"num_tokens": 33949649.0,
"reward": 1.0384899377822876,
"reward_std": 0.14921121299266815,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/asymmetric_l2_reward": 0.8945099115371704,
"rewards/final_brier_reward_step": 0.857469916343689,
"rewards/format_reward_step": 0.97265625,
"step": 141
},
{
"adv/mean_abs_final_conf": 0.6672189831733704,
"adv/mean_abs_reasoning": 0.5137597918510437,
"adv/mean_abs_step_conf": 0.7419067621231079,
"adv/ratio_final_to_reasoning": 1.2986983289786518,
"adv/ratio_step_to_reasoning": 1.4440732301180388,
"adv/std_final_conf": 0.8465292453765869,
"adv/std_reasoning": 0.7576342821121216,
"adv/std_step_conf": 0.9329071640968323,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7659574468085107,
"calib/avg_num_step_conf": 6.24609375,
"calib/ece": 0.2413095238095237,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5396825396825397,
"calib/gap": 0.40900709219858167,
"calib/mean_conf": 0.5888492063492063,
"calib/mu_c": 0.7690070921985817,
"calib/mu_w": 0.36,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1353174603174602,
"calib/std_conf": 0.45452646853028067,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3878585308056873,
"calib/step_q_c_n": 844.0,
"calib/step_q_gap": 0.09809694140171377,
"calib/step_q_w": 0.2897615894039735,
"calib/step_q_w_n": 755.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2580.0,
"completions/max_terminated_length": 2580.0,
"completions/mean_length": 547.94921875,
"completions/mean_terminated_length": 547.94921875,
"completions/min_length": 167.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.15146666666666667,
"grad_norm": 0.045452363789081573,
"kl": 0.07000732421875,
"learning_rate": 1.6111111111111113e-06,
"loss": 0.0256,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03237912803888321,
"mask/share_reasoning": 0.842139482498169,
"mask/share_step_conf": 0.12548136711120605,
"num_tokens": 34195084.0,
"reward": 0.9666558504104614,
"reward_std": 0.18624988198280334,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/asymmetric_l2_reward": 0.8902691602706909,
"rewards/final_brier_reward_step": 0.736011266708374,
"rewards/format_reward_step": 0.984375,
"step": 142
},
{
"adv/mean_abs_final_conf": 0.5834546089172363,
"adv/mean_abs_reasoning": 0.4283401668071747,
"adv/mean_abs_step_conf": 0.7408386468887329,
"adv/ratio_final_to_reasoning": 1.3621291070278947,
"adv/ratio_step_to_reasoning": 1.7295567969049124,
"adv/std_final_conf": 0.8025461435317993,
"adv/std_reasoning": 0.7204607129096985,
"adv/std_step_conf": 0.9318289160728455,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.8720285790598292,
"calib/avg_num_step_conf": 6.3671875,
"calib/ece": 0.1430645161290321,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6008064516129032,
"calib/gap": 0.5630608974358975,
"calib/mean_conf": 0.6624193548387097,
"calib/mu_c": 0.8985416666666668,
"calib/mu_w": 0.33548076923076925,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.11241935483870953,
"calib/std_conf": 0.4251126434551085,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4168470906630582,
"calib/step_q_c_n": 739.0,
"calib/step_q_gap": 0.19198738247001668,
"calib/step_q_w": 0.22485970819304152,
"calib/step_q_w_n": 891.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3056.0,
"completions/max_terminated_length": 3056.0,
"completions/mean_length": 567.859375,
"completions/mean_terminated_length": 567.859375,
"completions/min_length": 161.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.15253333333333333,
"grad_norm": 0.05021341145038605,
"kl": 0.06475830078125,
"learning_rate": 1.5833333333333333e-06,
"loss": 0.0437,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.0319594144821167,
"mask/share_reasoning": 0.8436037302017212,
"mask/share_step_conf": 0.12443678081035614,
"num_tokens": 34447792.0,
"reward": 1.0080523490905762,
"reward_std": 0.16858679056167603,
"rewards/accuracy_reward_step": 0.5625,
"rewards/asymmetric_l2_reward": 0.892905592918396,
"rewards/final_brier_reward_step": 0.8169492483139038,
"rewards/format_reward_step": 0.96875,
"step": 143
},
{
"adv/mean_abs_final_conf": 0.5916790962219238,
"adv/mean_abs_reasoning": 0.46679311990737915,
"adv/mean_abs_step_conf": 0.7288160920143127,
"adv/ratio_final_to_reasoning": 1.2675403106612293,
"adv/ratio_step_to_reasoning": 1.5613256942581417,
"adv/std_final_conf": 0.8130344152450562,
"adv/std_reasoning": 0.7393408417701721,
"adv/std_step_conf": 0.9324739575386047,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7388970588235293,
"calib/avg_num_step_conf": 5.75,
"calib/ece": 0.24780000000000002,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6,
"calib/gap": 0.39191911764705895,
"calib/mean_conf": 0.63188,
"calib/mu_c": 0.7572941176470589,
"calib/mu_w": 0.36537499999999995,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.09984000000000003,
"calib/std_conf": 0.4510858738643896,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.3856683937823834,
"calib/step_q_c_n": 965.0,
"calib/step_q_gap": 0.09519502100131827,
"calib/step_q_w": 0.2904733727810651,
"calib/step_q_w_n": 507.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2804.0,
"completions/max_terminated_length": 2804.0,
"completions/mean_length": 542.84765625,
"completions/mean_terminated_length": 544.9765014648438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.1536,
"grad_norm": 0.04553085193037987,
"kl": 0.0696868896484375,
"learning_rate": 1.5555555555555558e-06,
"loss": -0.0059,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.033757805824279785,
"mask/share_reasoning": 0.8401012420654297,
"mask/share_step_conf": 0.12223471701145172,
"num_tokens": 34690889.0,
"reward": 0.9717831611633301,
"reward_std": 0.16983790695667267,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/asymmetric_l2_reward": 0.8857837915420532,
"rewards/final_brier_reward_step": 0.7296574115753174,
"rewards/format_reward_step": 0.9765625,
"step": 144
},
{
"adv/mean_abs_final_conf": 0.6282063722610474,
"adv/mean_abs_reasoning": 0.5565600395202637,
"adv/mean_abs_step_conf": 0.7417995929718018,
"adv/ratio_final_to_reasoning": 1.128730644770222,
"adv/ratio_step_to_reasoning": 1.3328294169506105,
"adv/std_final_conf": 0.8449404835700989,
"adv/std_reasoning": 0.8097800612449646,
"adv/std_step_conf": 0.9331483244895935,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6935672514619883,
"calib/avg_num_step_conf": 6.8046875,
"calib/ece": 0.2536254980079681,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.7171314741035857,
"calib/gap": 0.255464912280702,
"calib/mean_conf": 0.7645418326693229,
"calib/mu_c": 0.8459649122807019,
"calib/mu_w": 0.5904999999999999,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16844621513944222,
"calib/std_conf": 0.3863622472481406,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.40837301587301583,
"calib/step_q_c_n": 1008.0,
"calib/step_q_gap": 0.13369999134985505,
"calib/step_q_w": 0.2746730245231608,
"calib/step_q_w_n": 734.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2707.0,
"completions/max_terminated_length": 2707.0,
"completions/mean_length": 501.26953125,
"completions/mean_terminated_length": 503.2353210449219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.15466666666666667,
"grad_norm": 0.051369134336709976,
"kl": 0.08301544189453125,
"learning_rate": 1.527777777777778e-06,
"loss": 0.1476,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03623100742697716,
"mask/share_reasoning": 0.8176732063293457,
"mask/share_step_conf": 0.14218956232070923,
"num_tokens": 34921918.0,
"reward": 0.9706292152404785,
"reward_std": 0.20070935785770416,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/asymmetric_l2_reward": 0.888382613658905,
"rewards/final_brier_reward_step": 0.7231882810592651,
"rewards/format_reward_step": 0.98046875,
"step": 145
},
{
"adv/mean_abs_final_conf": 0.679091215133667,
"adv/mean_abs_reasoning": 0.48408961296081543,
"adv/mean_abs_step_conf": 0.7339562177658081,
"adv/ratio_final_to_reasoning": 1.4028212895958914,
"adv/ratio_step_to_reasoning": 1.5161577487208304,
"adv/std_final_conf": 0.8739967942237854,
"adv/std_reasoning": 0.7575613856315613,
"adv/std_step_conf": 0.9337763786315918,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7498391248391247,
"calib/avg_num_step_conf": 5.89453125,
"calib/ece": 0.2546613545816732,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5976095617529881,
"calib/gap": 0.42930566280566274,
"calib/mean_conf": 0.6447808764940238,
"calib/mu_c": 0.8842342342342342,
"calib/mu_w": 0.45492857142857146,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.22860557768924294,
"calib/std_conf": 0.43954187916360593,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.42036423841059606,
"calib/step_q_c_n": 604.0,
"calib/step_q_gap": 0.12298302294098279,
"calib/step_q_w": 0.29738121546961327,
"calib/step_q_w_n": 905.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2266.0,
"completions/max_terminated_length": 2266.0,
"completions/mean_length": 533.35546875,
"completions/mean_terminated_length": 535.4470825195312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.15573333333333333,
"grad_norm": 0.03525509685277939,
"kl": 0.0661773681640625,
"learning_rate": 1.5e-06,
"loss": -0.069,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03156570717692375,
"mask/share_reasoning": 0.8492813110351562,
"mask/share_step_conf": 0.1152467131614685,
"num_tokens": 35165673.0,
"reward": 0.9413388967514038,
"reward_std": 0.20831407606601715,
"rewards/accuracy_reward_step": 0.43359375,
"rewards/asymmetric_l2_reward": 0.8832393884658813,
"rewards/final_brier_reward_step": 0.7166258096694946,
"rewards/format_reward_step": 0.98046875,
"step": 146
},
{
"adv/mean_abs_final_conf": 0.6118881702423096,
"adv/mean_abs_reasoning": 0.44582316279411316,
"adv/mean_abs_step_conf": 0.7450541257858276,
"adv/ratio_final_to_reasoning": 1.3724907571141327,
"adv/ratio_step_to_reasoning": 1.671187564854954,
"adv/std_final_conf": 0.8005033135414124,
"adv/std_reasoning": 0.7206440567970276,
"adv/std_step_conf": 0.9336183667182922,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6944263787721123,
"calib/avg_num_step_conf": 5.6640625,
"calib/ece": 0.2990725806451613,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6330645161290323,
"calib/gap": 0.32943548387096777,
"calib/mean_conf": 0.6661693548387098,
"calib/mu_c": 0.8308870967741936,
"calib/mu_w": 0.5014516129032258,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.23262096774193552,
"calib/std_conf": 0.44329803601898965,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.43249639249639243,
"calib/step_q_c_n": 693.0,
"calib/step_q_gap": 0.11671039513840037,
"calib/step_q_w": 0.31578599735799207,
"calib/step_q_w_n": 757.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2607.0,
"completions/max_terminated_length": 2607.0,
"completions/mean_length": 531.06640625,
"completions/mean_terminated_length": 535.248046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.1568,
"grad_norm": 0.04022669792175293,
"kl": 0.08133697509765625,
"learning_rate": 1.4722222222222225e-06,
"loss": 0.0078,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03174225613474846,
"mask/share_reasoning": 0.8448315262794495,
"mask/share_step_conf": 0.11561372131109238,
"num_tokens": 35405306.0,
"reward": 0.9095112681388855,
"reward_std": 0.21041998267173767,
"rewards/accuracy_reward_step": 0.484375,
"rewards/asymmetric_l2_reward": 0.8593860864639282,
"rewards/final_brier_reward_step": 0.6690112948417664,
"rewards/format_reward_step": 0.96875,
"step": 147
},
{
"adv/mean_abs_final_conf": 0.524472713470459,
"adv/mean_abs_reasoning": 0.4304084777832031,
"adv/mean_abs_step_conf": 0.7623563408851624,
"adv/ratio_final_to_reasoning": 1.218546428666389,
"adv/ratio_step_to_reasoning": 1.7712391373228513,
"adv/std_final_conf": 0.7791754007339478,
"adv/std_reasoning": 0.7013992667198181,
"adv/std_step_conf": 0.9334371089935303,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.752401059778109,
"calib/avg_num_step_conf": 5.72265625,
"calib/ece": 0.2162248995983935,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.7469879518072289,
"calib/gap": 0.34049180327868855,
"calib/mean_conf": 0.773574297188755,
"calib/mu_c": 0.8638251366120219,
"calib/mu_w": 0.5233333333333333,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.12742971887550195,
"calib/std_conf": 0.3913611243091092,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4259635666347075,
"calib/step_q_c_n": 1043.0,
"calib/step_q_gap": 0.13167446710864117,
"calib/step_q_w": 0.29428909952606636,
"calib/step_q_w_n": 422.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2980.0,
"completions/max_terminated_length": 2980.0,
"completions/mean_length": 516.078125,
"completions/mean_terminated_length": 520.1417236328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.15786666666666666,
"grad_norm": 0.0584280788898468,
"kl": 0.07340240478515625,
"learning_rate": 1.4444444444444445e-06,
"loss": -0.0201,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03611718863248825,
"mask/share_reasoning": 0.8259831070899963,
"mask/share_step_conf": 0.13008719682693481,
"num_tokens": 35642534.0,
"reward": 0.9886473417282104,
"reward_std": 0.18284042179584503,
"rewards/accuracy_reward_step": 0.71484375,
"rewards/asymmetric_l2_reward": 0.8780118227005005,
"rewards/final_brier_reward_step": 0.7617827653884888,
"rewards/format_reward_step": 0.97265625,
"step": 148
},
{
"adv/mean_abs_final_conf": 0.566154956817627,
"adv/mean_abs_reasoning": 0.4284716844558716,
"adv/mean_abs_step_conf": 0.7544533014297485,
"adv/ratio_final_to_reasoning": 1.3213357553290908,
"adv/ratio_step_to_reasoning": 1.7608008388882228,
"adv/std_final_conf": 0.7966720461845398,
"adv/std_reasoning": 0.7013741135597229,
"adv/std_step_conf": 0.9313024282455444,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.8105348988910633,
"calib/avg_num_step_conf": 6.25390625,
"calib/ece": 0.16796812749003978,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.7091633466135459,
"calib/gap": 0.5252994129158515,
"calib/mean_conf": 0.7432669322709163,
"calib/mu_c": 0.9630136986301372,
"calib/mu_w": 0.4377142857142857,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.16478087649402384,
"calib/std_conf": 0.4068463942424941,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4335380835380835,
"calib/step_q_c_n": 814.0,
"calib/step_q_gap": 0.15961178112385216,
"calib/step_q_w": 0.2739263024142313,
"calib/step_q_w_n": 787.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2226.0,
"completions/max_terminated_length": 2226.0,
"completions/mean_length": 548.18359375,
"completions/mean_terminated_length": 550.3333740234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 86.0,
"epoch": 0.15893333333333334,
"grad_norm": 0.0576673299074173,
"kl": 0.062652587890625,
"learning_rate": 1.4166666666666667e-06,
"loss": -0.0267,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03428906947374344,
"mask/share_reasoning": 0.8384957313537598,
"mask/share_step_conf": 0.12330888956785202,
"num_tokens": 35887325.0,
"reward": 1.0077321529388428,
"reward_std": 0.17122933268547058,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/asymmetric_l2_reward": 0.9006611108779907,
"rewards/final_brier_reward_step": 0.8046468496322632,
"rewards/format_reward_step": 0.98046875,
"step": 149
},
{
"adv/mean_abs_final_conf": 0.6269816160202026,
"adv/mean_abs_reasoning": 0.46332746744155884,
"adv/mean_abs_step_conf": 0.7745290994644165,
"adv/ratio_final_to_reasoning": 1.3532148643859239,
"adv/ratio_step_to_reasoning": 1.6716667020439722,
"adv/std_final_conf": 0.8205464482307434,
"adv/std_reasoning": 0.720609188079834,
"adv/std_step_conf": 0.9327038526535034,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7517542652724271,
"calib/avg_num_step_conf": 5.65625,
"calib/ece": 0.24335999999999997,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.732,
"calib/gap": 0.3722977435332966,
"calib/mean_conf": 0.7546400000000001,
"calib/mu_c": 0.8916455696202531,
"calib/mu_w": 0.5193478260869565,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.18299999999999997,
"calib/std_conf": 0.41221071116602487,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.42653179190751445,
"calib/step_q_c_n": 865.0,
"calib/step_q_gap": 0.10047690339979576,
"calib/step_q_w": 0.3260548885077187,
"calib/step_q_w_n": 583.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2716.0,
"completions/max_terminated_length": 2716.0,
"completions/mean_length": 478.453125,
"completions/mean_terminated_length": 478.453125,
"completions/min_length": 142.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.16,
"grad_norm": 0.025151947513222694,
"kl": 0.0756683349609375,
"learning_rate": 1.3888888888888892e-06,
"loss": -0.0234,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.038418009877204895,
"mask/share_reasoning": 0.8237306475639343,
"mask/share_step_conf": 0.1378513127565384,
"num_tokens": 36114769.0,
"reward": 0.9638096690177917,
"reward_std": 0.19389088451862335,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/asymmetric_l2_reward": 0.8756263256072998,
"rewards/final_brier_reward_step": 0.734024167060852,
"rewards/format_reward_step": 0.97265625,
"step": 150
},
{
"adv/mean_abs_final_conf": 0.6179725527763367,
"adv/mean_abs_reasoning": 0.427360862493515,
"adv/mean_abs_step_conf": 0.7608951330184937,
"adv/ratio_final_to_reasoning": 1.4460204642293704,
"adv/ratio_step_to_reasoning": 1.7804511357893467,
"adv/std_final_conf": 0.8229371905326843,
"adv/std_reasoning": 0.7013833522796631,
"adv/std_step_conf": 0.9335458874702454,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.8082643515714383,
"calib/avg_num_step_conf": 6.76171875,
"calib/ece": 0.22418032786885236,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.569672131147541,
"calib/gap": 0.5015182717544922,
"calib/mean_conf": 0.6102459016393442,
"calib/mu_c": 0.8712820512820512,
"calib/mu_w": 0.36976377952755907,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.17745901639344253,
"calib/std_conf": 0.4581940012822868,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.40887905604719765,
"calib/step_q_c_n": 678.0,
"calib/step_q_gap": 0.17155711872526036,
"calib/step_q_w": 0.2373219373219373,
"calib/step_q_w_n": 1053.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2391.0,
"completions/max_terminated_length": 2391.0,
"completions/mean_length": 582.28515625,
"completions/mean_terminated_length": 591.52783203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 149.0,
"epoch": 0.16106666666666666,
"grad_norm": 0.031244348734617233,
"kl": 0.061309814453125,
"learning_rate": 1.3611111111111112e-06,
"loss": -0.0475,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.030964188277721405,
"mask/share_reasoning": 0.8384581804275513,
"mask/share_step_conf": 0.11495261639356613,
"num_tokens": 36370858.0,
"reward": 0.9330952167510986,
"reward_std": 0.18629847466945648,
"rewards/accuracy_reward_step": 0.45703125,
"rewards/asymmetric_l2_reward": 0.8467037081718445,
"rewards/final_brier_reward_step": 0.7374554872512817,
"rewards/format_reward_step": 0.953125,
"step": 151
},
{
"adv/mean_abs_final_conf": 0.6945838928222656,
"adv/mean_abs_reasoning": 0.5879529118537903,
"adv/mean_abs_step_conf": 0.7506691217422485,
"adv/ratio_final_to_reasoning": 1.1813597293570202,
"adv/ratio_step_to_reasoning": 1.276750410803173,
"adv/std_final_conf": 0.8834863901138306,
"adv/std_reasoning": 0.8265911340713501,
"adv/std_step_conf": 0.9328155517578125,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7469903038979632,
"calib/avg_num_step_conf": 6.3828125,
"calib/ece": 0.3006854838709677,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.5967741935483871,
"calib/gap": 0.33690440554434803,
"calib/mean_conf": 0.6297177419354838,
"calib/mu_c": 0.7940944881889762,
"calib/mu_w": 0.4571900826446282,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.2091532258064516,
"calib/std_conf": 0.45284085469977603,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.3859042553191489,
"calib/step_q_c_n": 752.0,
"calib/step_q_gap": 0.10618770203116701,
"calib/step_q_w": 0.2797165532879819,
"calib/step_q_w_n": 882.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2350.0,
"completions/max_terminated_length": 2350.0,
"completions/mean_length": 544.1328125,
"completions/mean_terminated_length": 548.4172973632812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.16213333333333332,
"grad_norm": 0.04397697374224663,
"kl": 0.09905242919921875,
"learning_rate": 1.3333333333333334e-06,
"loss": -0.0666,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.033129818737506866,
"mask/share_reasoning": 0.8305137157440186,
"mask/share_step_conf": 0.1285439282655716,
"num_tokens": 36615548.0,
"reward": 0.9151645302772522,
"reward_std": 0.22851672768592834,
"rewards/accuracy_reward_step": 0.5,
"rewards/asymmetric_l2_reward": 0.8679298162460327,
"rewards/final_brier_reward_step": 0.6702117323875427,
"rewards/format_reward_step": 0.9609375,
"step": 152
},
{
"adv/mean_abs_final_conf": 0.6541010141372681,
"adv/mean_abs_reasoning": 0.4219273328781128,
"adv/mean_abs_step_conf": 0.7459571361541748,
"adv/ratio_final_to_reasoning": 1.550269354856482,
"adv/ratio_step_to_reasoning": 1.7679753787595183,
"adv/std_final_conf": 0.8594304323196411,
"adv/std_reasoning": 0.7204716801643372,
"adv/std_step_conf": 0.932479202747345,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6297800673667524,
"calib/avg_num_step_conf": 5.5625,
"calib/ece": 0.3261200000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.648,
"calib/gap": 0.2313539396341061,
"calib/mean_conf": 0.68924,
"calib/mu_c": 0.7845578231292518,
"calib/mu_w": 0.5532038834951457,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.21368000000000006,
"calib/std_conf": 0.43288130289953614,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4063384188626907,
"calib/step_q_c_n": 721.0,
"calib/step_q_gap": 0.13020755115287558,
"calib/step_q_w": 0.2761308677098151,
"calib/step_q_w_n": 703.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3011.0,
"completions/max_terminated_length": 3011.0,
"completions/mean_length": 520.55078125,
"completions/mean_terminated_length": 522.5921630859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.1632,
"grad_norm": 0.0666937530040741,
"kl": 0.0721282958984375,
"learning_rate": 1.3055555555555556e-06,
"loss": 0.0571,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03248149901628494,
"mask/share_reasoning": 0.8496717214584351,
"mask/share_step_conf": 0.11394055187702179,
"num_tokens": 36856129.0,
"reward": 0.9188251495361328,
"reward_std": 0.19821104407310486,
"rewards/accuracy_reward_step": 0.578125,
"rewards/asymmetric_l2_reward": 0.8702654838562012,
"rewards/final_brier_reward_step": 0.6564472913742065,
"rewards/format_reward_step": 0.9765625,
"step": 153
},
{
"adv/mean_abs_final_conf": 0.593313455581665,
"adv/mean_abs_reasoning": 0.40208834409713745,
"adv/mean_abs_step_conf": 0.7636384963989258,
"adv/ratio_final_to_reasoning": 1.4755798428176545,
"adv/ratio_step_to_reasoning": 1.8991808830311285,
"adv/std_final_conf": 0.8072009682655334,
"adv/std_reasoning": 0.6816769242286682,
"adv/std_step_conf": 0.9328876733779907,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7256451612903225,
"calib/avg_num_step_conf": 5.28515625,
"calib/ece": 0.2687550200803214,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.6024096385542169,
"calib/gap": 0.3908619354838709,
"calib/mean_conf": 0.6415261044176708,
"calib/mu_c": 0.8377419354838709,
"calib/mu_w": 0.44688,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.2061445783132531,
"calib/std_conf": 0.4517632571194984,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4189130434782608,
"calib/step_q_c_n": 644.0,
"calib/step_q_gap": 0.12159287422579251,
"calib/step_q_w": 0.2973201692524683,
"calib/step_q_w_n": 709.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2620.0,
"completions/max_terminated_length": 2620.0,
"completions/mean_length": 517.84375,
"completions/mean_terminated_length": 519.87451171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 149.0,
"epoch": 0.16426666666666667,
"grad_norm": 0.04594266042113304,
"kl": 0.075714111328125,
"learning_rate": 1.2777777777777779e-06,
"loss": 0.0992,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.034510090947151184,
"mask/share_reasoning": 0.8454740047454834,
"mask/share_step_conf": 0.11610963195562363,
"num_tokens": 37093137.0,
"reward": 0.9312186241149902,
"reward_std": 0.18028542399406433,
"rewards/accuracy_reward_step": 0.484375,
"rewards/asymmetric_l2_reward": 0.8699989318847656,
"rewards/final_brier_reward_step": 0.7010320425033569,
"rewards/format_reward_step": 0.97265625,
"step": 154
},
{
"adv/mean_abs_final_conf": 0.6469031572341919,
"adv/mean_abs_reasoning": 0.47271543741226196,
"adv/mean_abs_step_conf": 0.7635318040847778,
"adv/ratio_final_to_reasoning": 1.3684832481364857,
"adv/ratio_step_to_reasoning": 1.6152038703548637,
"adv/std_final_conf": 0.8352489471435547,
"adv/std_reasoning": 0.7392789125442505,
"adv/std_step_conf": 0.9331855773925781,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7134146341463414,
"calib/avg_num_step_conf": 5.77734375,
"calib/ece": 0.29729411764705893,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5058823529411764,
"calib/gap": 0.32256097560975616,
"calib/mean_conf": 0.5472549019607844,
"calib/mu_c": 0.7142276422764228,
"calib/mu_w": 0.3916666666666666,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.18109803921568635,
"calib/std_conf": 0.46258641346990637,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.36075822603719593,
"calib/step_q_c_n": 699.0,
"calib/step_q_gap": 0.08251463629360617,
"calib/step_q_w": 0.27824358974358976,
"calib/step_q_w_n": 780.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2952.0,
"completions/max_terminated_length": 2952.0,
"completions/mean_length": 478.0625,
"completions/mean_terminated_length": 478.0625,
"completions/min_length": 139.0,
"completions/min_terminated_length": 139.0,
"epoch": 0.16533333333333333,
"grad_norm": 0.0336977019906044,
"kl": 0.08032989501953125,
"learning_rate": 1.25e-06,
"loss": 0.0138,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03487030789256096,
"mask/share_reasoning": 0.8355661034584045,
"mask/share_step_conf": 0.12956362962722778,
"num_tokens": 37322737.0,
"reward": 0.9273616671562195,
"reward_std": 0.1708342432975769,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/asymmetric_l2_reward": 0.8782615661621094,
"rewards/final_brier_reward_step": 0.6827117204666138,
"rewards/format_reward_step": 0.98828125,
"step": 155
},
{
"adv/mean_abs_final_conf": 0.6737991571426392,
"adv/mean_abs_reasoning": 0.4827231764793396,
"adv/mean_abs_step_conf": 0.7515172958374023,
"adv/ratio_final_to_reasoning": 1.3958293075067996,
"adv/ratio_step_to_reasoning": 1.5568287011169994,
"adv/std_final_conf": 0.8648738265037537,
"adv/std_reasoning": 0.7206152677536011,
"adv/std_step_conf": 0.9330052733421326,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7186783804430862,
"calib/avg_num_step_conf": 5.82421875,
"calib/ece": 0.25689243027888453,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5577689243027888,
"calib/gap": 0.3968054494525084,
"calib/mean_conf": 0.6011155378486056,
"calib/mu_c": 0.7892424242424243,
"calib/mu_w": 0.3924369747899159,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.1660557768924303,
"calib/std_conf": 0.4584387486378078,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.3498605830164765,
"calib/step_q_c_n": 789.0,
"calib/step_q_gap": 0.0351169932728867,
"calib/step_q_w": 0.3147435897435898,
"calib/step_q_w_n": 702.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2925.0,
"completions/max_terminated_length": 2925.0,
"completions/mean_length": 514.51171875,
"completions/mean_terminated_length": 514.51171875,
"completions/min_length": 150.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.1664,
"grad_norm": 0.04300342872738838,
"kl": 0.08170700073242188,
"learning_rate": 1.2222222222222223e-06,
"loss": -0.0075,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03448965772986412,
"mask/share_reasoning": 0.8360552787780762,
"mask/share_step_conf": 0.129455104470253,
"num_tokens": 37559212.0,
"reward": 0.9496381282806396,
"reward_std": 0.16709250211715698,
"rewards/accuracy_reward_step": 0.515625,
"rewards/asymmetric_l2_reward": 0.8816511631011963,
"rewards/final_brier_reward_step": 0.7184062004089355,
"rewards/format_reward_step": 0.98046875,
"step": 156
},
{
"adv/mean_abs_final_conf": 0.5912591218948364,
"adv/mean_abs_reasoning": 0.49544456601142883,
"adv/mean_abs_step_conf": 0.7444217205047607,
"adv/ratio_final_to_reasoning": 1.1933910722944479,
"adv/ratio_step_to_reasoning": 1.5025328191561769,
"adv/std_final_conf": 0.82242351770401,
"adv/std_reasoning": 0.7575790882110596,
"adv/std_step_conf": 0.933133602142334,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.790162701668034,
"calib/avg_num_step_conf": 6.3203125,
"calib/ece": 0.20235059760956173,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6135458167330677,
"calib/gap": 0.49839896089691005,
"calib/mean_conf": 0.6459362549800797,
"calib/mu_c": 0.8286163522012578,
"calib/mu_w": 0.3302173913043478,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10741035856573701,
"calib/std_conf": 0.45350540384328225,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3897894736842106,
"calib/step_q_c_n": 1045.0,
"calib/step_q_gap": 0.11057481399834668,
"calib/step_q_w": 0.2792146596858639,
"calib/step_q_w_n": 573.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2485.0,
"completions/max_terminated_length": 2485.0,
"completions/mean_length": 497.61328125,
"completions/mean_terminated_length": 501.531494140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.16746666666666668,
"grad_norm": 0.04095279052853584,
"kl": 0.0734100341796875,
"learning_rate": 1.1944444444444446e-06,
"loss": -0.0316,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03470167517662048,
"mask/share_reasoning": 0.8203328847885132,
"mask/share_step_conf": 0.13715294003486633,
"num_tokens": 37790329.0,
"reward": 0.9915717244148254,
"reward_std": 0.1898421347141266,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/asymmetric_l2_reward": 0.8848938345909119,
"rewards/final_brier_reward_step": 0.7779370546340942,
"rewards/format_reward_step": 0.98046875,
"step": 157
},
{
"adv/mean_abs_final_conf": 0.5748130083084106,
"adv/mean_abs_reasoning": 0.41429877281188965,
"adv/mean_abs_step_conf": 0.7429898977279663,
"adv/ratio_final_to_reasoning": 1.3874359424409923,
"adv/ratio_step_to_reasoning": 1.7933673630873082,
"adv/std_final_conf": 0.7997155785560608,
"adv/std_reasoning": 0.6816416382789612,
"adv/std_step_conf": 0.9338192939758301,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7263106796116505,
"calib/avg_num_step_conf": 5.40234375,
"calib/ece": 0.27177865612648217,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6086956521739131,
"calib/gap": 0.3565825242718448,
"calib/mean_conf": 0.6508300395256919,
"calib/mu_c": 0.7960000000000002,
"calib/mu_w": 0.43941747572815537,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16486166007905134,
"calib/std_conf": 0.4487590517510003,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42103761348897534,
"calib/step_q_c_n": 771.0,
"calib/step_q_gap": 0.1080310775412629,
"calib/step_q_w": 0.31300653594771244,
"calib/step_q_w_n": 612.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2833.0,
"completions/max_terminated_length": 2833.0,
"completions/mean_length": 500.37109375,
"completions/mean_terminated_length": 500.37109375,
"completions/min_length": 131.0,
"completions/min_terminated_length": 131.0,
"epoch": 0.16853333333333334,
"grad_norm": 0.05039157345890999,
"kl": 0.0789794921875,
"learning_rate": 1.1666666666666668e-06,
"loss": 0.0354,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.037866123020648956,
"mask/share_reasoning": 0.836431622505188,
"mask/share_step_conf": 0.12570224702358246,
"num_tokens": 38023664.0,
"reward": 0.9502644538879395,
"reward_std": 0.16423243284225464,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/asymmetric_l2_reward": 0.8681695461273193,
"rewards/final_brier_reward_step": 0.7175155878067017,
"rewards/format_reward_step": 0.98828125,
"step": 158
},
{
"adv/mean_abs_final_conf": 0.6193236708641052,
"adv/mean_abs_reasoning": 0.4926747977733612,
"adv/mean_abs_step_conf": 0.7591203451156616,
"adv/ratio_final_to_reasoning": 1.2570638353395227,
"adv/ratio_step_to_reasoning": 1.540814242064945,
"adv/std_final_conf": 0.815812349319458,
"adv/std_reasoning": 0.7574940323829651,
"adv/std_step_conf": 0.9329251050949097,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6892206410426918,
"calib/avg_num_step_conf": 5.359375,
"calib/ece": 0.3099196787148594,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5140562248995983,
"calib/gap": 0.3123513765128343,
"calib/mean_conf": 0.5535341365461848,
"calib/mu_c": 0.6827397260273973,
"calib/mu_w": 0.370388349514563,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13855421686746988,
"calib/std_conf": 0.46807771451615177,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42608767123287666,
"calib/step_q_c_n": 730.0,
"calib/step_q_gap": 0.1379412537873938,
"calib/step_q_w": 0.28814641744548286,
"calib/step_q_w_n": 642.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3024.0,
"completions/max_terminated_length": 3024.0,
"completions/mean_length": 501.59375,
"completions/mean_terminated_length": 501.59375,
"completions/min_length": 130.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.1696,
"grad_norm": 0.049436114728450775,
"kl": 0.080230712890625,
"learning_rate": 1.138888888888889e-06,
"loss": 0.0599,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03590218350291252,
"mask/share_reasoning": 0.8452440500259399,
"mask/share_step_conf": 0.11885374784469604,
"num_tokens": 38256856.0,
"reward": 0.9326581358909607,
"reward_std": 0.1965138167142868,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/asymmetric_l2_reward": 0.8867565393447876,
"rewards/final_brier_reward_step": 0.6699659824371338,
"rewards/format_reward_step": 0.97265625,
"step": 159
},
{
"adv/mean_abs_final_conf": 0.651262104511261,
"adv/mean_abs_reasoning": 0.4572453498840332,
"adv/mean_abs_step_conf": 0.7492605447769165,
"adv/ratio_final_to_reasoning": 1.4243165177654282,
"adv/ratio_step_to_reasoning": 1.6386400538943575,
"adv/std_final_conf": 0.8251065611839294,
"adv/std_reasoning": 0.7206948399543762,
"adv/std_step_conf": 0.933678925037384,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7278619864379737,
"calib/avg_num_step_conf": 5.7578125,
"calib/ece": 0.30607287449392717,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.5060728744939271,
"calib/gap": 0.35182289589150384,
"calib/mean_conf": 0.5299595141700404,
"calib/mu_c": 0.6852173913043479,
"calib/mu_w": 0.33339449541284405,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.13866396761133604,
"calib/std_conf": 0.4773497160335752,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.40606488011283504,
"calib/step_q_c_n": 709.0,
"calib/step_q_gap": 0.13901912847884806,
"calib/step_q_w": 0.26704575163398697,
"calib/step_q_w_n": 765.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2483.0,
"completions/max_terminated_length": 2483.0,
"completions/mean_length": 499.25,
"completions/mean_terminated_length": 501.2078857421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.17066666666666666,
"grad_norm": 0.03691410645842552,
"kl": 0.0764007568359375,
"learning_rate": 1.111111111111111e-06,
"loss": 0.0152,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03438437357544899,
"mask/share_reasoning": 0.8391258716583252,
"mask/share_step_conf": 0.12258350849151611,
"num_tokens": 38489504.0,
"reward": 0.9193039536476135,
"reward_std": 0.19498516619205475,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/asymmetric_l2_reward": 0.8641306161880493,
"rewards/final_brier_reward_step": 0.6736960411071777,
"rewards/format_reward_step": 0.96484375,
"step": 160
},
{
"adv/mean_abs_final_conf": 0.5909967422485352,
"adv/mean_abs_reasoning": 0.3688772916793823,
"adv/mean_abs_step_conf": 0.748254120349884,
"adv/ratio_final_to_reasoning": 1.602149971221901,
"adv/ratio_step_to_reasoning": 2.028463495118711,
"adv/std_final_conf": 0.8158046007156372,
"adv/std_reasoning": 0.6814883947372437,
"adv/std_step_conf": 0.9323292374610901,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7071225071225071,
"calib/avg_num_step_conf": 5.46484375,
"calib/ece": 0.32039370078740165,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5118110236220472,
"calib/gap": 0.3662433862433863,
"calib/mean_conf": 0.5425196850393701,
"calib/mu_c": 0.6362433862433863,
"calib/mu_w": 0.26999999999999996,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05940944881889766,
"calib/std_conf": 0.47380262428341774,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.38986328125,
"calib/step_q_c_n": 1024.0,
"calib/step_q_gap": 0.11586328125,
"calib/step_q_w": 0.274,
"calib/step_q_w_n": 375.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2926.0,
"completions/max_terminated_length": 2926.0,
"completions/mean_length": 456.0859375,
"completions/mean_terminated_length": 456.0859375,
"completions/min_length": 150.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.17173333333333332,
"grad_norm": 0.10700822621583939,
"kl": 0.079345703125,
"learning_rate": 1.0833333333333335e-06,
"loss": 0.0686,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03930460289120674,
"mask/share_reasoning": 0.8258973360061646,
"mask/share_step_conf": 0.1347980797290802,
"num_tokens": 38710182.0,
"reward": 0.9591859579086304,
"reward_std": 0.14740516245365143,
"rewards/accuracy_reward_step": 0.73828125,
"rewards/asymmetric_l2_reward": 0.8936820030212402,
"rewards/final_brier_reward_step": 0.6785961389541626,
"rewards/format_reward_step": 0.9921875,
"step": 161
},
{
"adv/mean_abs_final_conf": 0.5586308240890503,
"adv/mean_abs_reasoning": 0.4160280227661133,
"adv/mean_abs_step_conf": 0.7616361975669861,
"adv/ratio_final_to_reasoning": 1.3427721055297923,
"adv/ratio_step_to_reasoning": 1.8307329215541093,
"adv/std_final_conf": 0.7841950058937073,
"adv/std_reasoning": 0.701278567314148,
"adv/std_step_conf": 0.9325926899909973,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.8157858707557503,
"calib/avg_num_step_conf": 5.265625,
"calib/ece": 0.20877952755905524,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5669291338582677,
"calib/gap": 0.5165840635268346,
"calib/mean_conf": 0.6009055118110236,
"calib/mu_c": 0.7798795180722891,
"calib/mu_w": 0.2632954545454545,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.07807086614173239,
"calib/std_conf": 0.4637407457415819,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4075116279069767,
"calib/step_q_c_n": 860.0,
"calib/step_q_gap": 0.1085157262676324,
"calib/step_q_w": 0.2989959016393443,
"calib/step_q_w_n": 488.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1455.0,
"completions/max_terminated_length": 1455.0,
"completions/mean_length": 442.27734375,
"completions/mean_terminated_length": 444.01177978515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.1728,
"grad_norm": 0.08423297107219696,
"kl": 0.08339691162109375,
"learning_rate": 1.0555555555555557e-06,
"loss": 0.0107,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03754296526312828,
"mask/share_reasoning": 0.8310139179229736,
"mask/share_step_conf": 0.1275368630886078,
"num_tokens": 38927549.0,
"reward": 0.999754786491394,
"reward_std": 0.14653810858726501,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/asymmetric_l2_reward": 0.8925575613975525,
"rewards/final_brier_reward_step": 0.7796082496643066,
"rewards/format_reward_step": 0.98828125,
"step": 162
},
{
"adv/mean_abs_final_conf": 0.5742782354354858,
"adv/mean_abs_reasoning": 0.4988449811935425,
"adv/mean_abs_step_conf": 0.7395628690719604,
"adv/ratio_final_to_reasoning": 1.1512158227220426,
"adv/ratio_step_to_reasoning": 1.4825504855285372,
"adv/std_final_conf": 0.8037339448928833,
"adv/std_reasoning": 0.7575864791870117,
"adv/std_step_conf": 0.9324069023132324,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7783159227603672,
"calib/avg_num_step_conf": 6.00390625,
"calib/ece": 0.22789682539682532,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.46825396825396826,
"calib/gap": 0.4833561253561253,
"calib/mean_conf": 0.5043253968253968,
"calib/mu_c": 0.7287407407407407,
"calib/mu_w": 0.24538461538461537,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.09825396825396818,
"calib/std_conf": 0.47483825699640314,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4153507565337002,
"calib/step_q_c_n": 727.0,
"calib/step_q_gap": 0.15149890468184835,
"calib/step_q_w": 0.26385185185185184,
"calib/step_q_w_n": 810.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2386.0,
"completions/max_terminated_length": 2386.0,
"completions/mean_length": 545.7578125,
"completions/mean_terminated_length": 547.8980712890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.17386666666666667,
"grad_norm": 0.02793508768081665,
"kl": 0.07421112060546875,
"learning_rate": 1.0277777777777777e-06,
"loss": -0.0248,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03470218926668167,
"mask/share_reasoning": 0.8333349227905273,
"mask/share_step_conf": 0.1280566155910492,
"num_tokens": 39172095.0,
"reward": 0.9705761671066284,
"reward_std": 0.16870234906673431,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/asymmetric_l2_reward": 0.8863610029220581,
"rewards/final_brier_reward_step": 0.7532289028167725,
"rewards/format_reward_step": 0.98046875,
"step": 163
},
{
"adv/mean_abs_final_conf": 0.6500420570373535,
"adv/mean_abs_reasoning": 0.44920510053634644,
"adv/mean_abs_step_conf": 0.7497342824935913,
"adv/ratio_final_to_reasoning": 1.4470941141612368,
"adv/ratio_step_to_reasoning": 1.6690244202445965,
"adv/std_final_conf": 0.8430293202400208,
"adv/std_reasoning": 0.7015290856361389,
"adv/std_step_conf": 0.9337033033370972,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.770204987596292,
"calib/avg_num_step_conf": 5.8984375,
"calib/ece": 0.25626506024096396,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5301204819277109,
"calib/gap": 0.42616921269095165,
"calib/mean_conf": 0.5546586345381527,
"calib/mu_c": 0.7446376811594202,
"calib/mu_w": 0.31846846846846855,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.12835341365461853,
"calib/std_conf": 0.47459640847724605,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3912232030264817,
"calib/step_q_c_n": 793.0,
"calib/step_q_gap": 0.10328735923289734,
"calib/step_q_w": 0.2879358437935844,
"calib/step_q_w_n": 717.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2351.0,
"completions/max_terminated_length": 2351.0,
"completions/mean_length": 551.2109375,
"completions/mean_terminated_length": 557.7470703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 123.0,
"epoch": 0.17493333333333333,
"grad_norm": 0.06633436679840088,
"kl": 0.07451629638671875,
"learning_rate": 1.0000000000000002e-06,
"loss": -0.0535,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.030875276774168015,
"mask/share_reasoning": 0.8452262282371521,
"mask/share_step_conf": 0.11217975616455078,
"num_tokens": 39419341.0,
"reward": 0.9514139890670776,
"reward_std": 0.2097827047109604,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/asymmetric_l2_reward": 0.8823947906494141,
"rewards/final_brier_reward_step": 0.7180894613265991,
"rewards/format_reward_step": 0.97265625,
"step": 164
},
{
"adv/mean_abs_final_conf": 0.5945655107498169,
"adv/mean_abs_reasoning": 0.49014484882354736,
"adv/mean_abs_step_conf": 0.7411804795265198,
"adv/ratio_final_to_reasoning": 1.2130404148424725,
"adv/ratio_step_to_reasoning": 1.5121662123054271,
"adv/std_final_conf": 0.8082362413406372,
"adv/std_reasoning": 0.7575728297233582,
"adv/std_step_conf": 0.9333819150924683,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7232782898105479,
"calib/avg_num_step_conf": 5.5,
"calib/ece": 0.2599600000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.572,
"calib/gap": 0.4444726062467999,
"calib/mean_conf": 0.60276,
"calib/mu_c": 0.8267741935483872,
"calib/mu_w": 0.3823015873015873,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.18336000000000005,
"calib/std_conf": 0.47151583472880315,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4217050691244239,
"calib/step_q_c_n": 651.0,
"calib/step_q_gap": 0.1091423214361808,
"calib/step_q_w": 0.3125627476882431,
"calib/step_q_w_n": 757.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2518.0,
"completions/max_terminated_length": 2518.0,
"completions/mean_length": 535.36328125,
"completions/mean_terminated_length": 535.36328125,
"completions/min_length": 126.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.176,
"grad_norm": 0.06460738927125931,
"kl": 0.12464141845703125,
"learning_rate": 9.722222222222224e-07,
"loss": -0.0664,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03398456051945686,
"mask/share_reasoning": 0.8485789895057678,
"mask/share_step_conf": 0.11743646115064621,
"num_tokens": 39661970.0,
"reward": 0.9417548775672913,
"reward_std": 0.1901148110628128,
"rewards/accuracy_reward_step": 0.484375,
"rewards/asymmetric_l2_reward": 0.8748019337654114,
"rewards/final_brier_reward_step": 0.7173015475273132,
"rewards/format_reward_step": 0.97265625,
"step": 165
},
{
"adv/mean_abs_final_conf": 0.4994466304779053,
"adv/mean_abs_reasoning": 0.37650659680366516,
"adv/mean_abs_step_conf": 0.7547671794891357,
"adv/ratio_final_to_reasoning": 1.326528232753247,
"adv/ratio_step_to_reasoning": 2.004658579415861,
"adv/std_final_conf": 0.7379820942878723,
"adv/std_reasoning": 0.6612639427185059,
"adv/std_step_conf": 0.9327965378761292,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.8125730994152047,
"calib/avg_num_step_conf": 6.0859375,
"calib/ece": 0.18629482071713144,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6374501992031872,
"calib/gap": 0.5553486842105264,
"calib/mean_conf": 0.6624701195219124,
"calib/mu_c": 0.8394736842105264,
"calib/mu_w": 0.284125,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.08374501992031869,
"calib/std_conf": 0.4574895489015776,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4180082559339525,
"calib/step_q_c_n": 969.0,
"calib/step_q_gap": 0.15486733912580308,
"calib/step_q_w": 0.26314091680814944,
"calib/step_q_w_n": 589.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2201.0,
"completions/max_terminated_length": 2201.0,
"completions/mean_length": 531.1875,
"completions/mean_terminated_length": 533.2706298828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 173.0,
"epoch": 0.17706666666666668,
"grad_norm": 0.0442763976752758,
"kl": 0.06806182861328125,
"learning_rate": 9.444444444444445e-07,
"loss": -0.0328,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.032991744577884674,
"mask/share_reasoning": 0.8350811004638672,
"mask/share_step_conf": 0.12802088260650635,
"num_tokens": 39904138.0,
"reward": 1.0148489475250244,
"reward_std": 0.15825651586055756,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/asymmetric_l2_reward": 0.9015299081802368,
"rewards/final_brier_reward_step": 0.7984804511070251,
"rewards/format_reward_step": 0.98046875,
"step": 166
},
{
"adv/mean_abs_final_conf": 0.48053890466690063,
"adv/mean_abs_reasoning": 0.4025387167930603,
"adv/mean_abs_step_conf": 0.7606292963027954,
"adv/ratio_final_to_reasoning": 1.1937706476913603,
"adv/ratio_step_to_reasoning": 1.8895804666009919,
"adv/std_final_conf": 0.7448135614395142,
"adv/std_reasoning": 0.6815594434738159,
"adv/std_step_conf": 0.9320181608200073,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6876629324314802,
"calib/avg_num_step_conf": 5.83984375,
"calib/ece": 0.23704724409448819,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.7637795275590551,
"calib/gap": 0.3711963644049886,
"calib/mean_conf": 0.770984251968504,
"calib/mu_c": 0.8922807017543862,
"calib/mu_w": 0.5210843373493976,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.16740157480314963,
"calib/std_conf": 0.4104494077074939,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4270010235414535,
"calib/step_q_c_n": 977.0,
"calib/step_q_gap": 0.05418249072292064,
"calib/step_q_w": 0.37281853281853283,
"calib/step_q_w_n": 518.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2715.0,
"completions/max_terminated_length": 2715.0,
"completions/mean_length": 474.296875,
"completions/mean_terminated_length": 476.1568908691406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.17813333333333334,
"grad_norm": 0.051055651158094406,
"kl": 0.07321929931640625,
"learning_rate": 9.166666666666666e-07,
"loss": -0.0208,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03513041511178017,
"mask/share_reasoning": 0.8342991471290588,
"mask/share_step_conf": 0.1266641467809677,
"num_tokens": 40131166.0,
"reward": 0.9845085144042969,
"reward_std": 0.1467195451259613,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/asymmetric_l2_reward": 0.8823486566543579,
"rewards/final_brier_reward_step": 0.7554183006286621,
"rewards/format_reward_step": 0.98828125,
"step": 167
},
{
"adv/mean_abs_final_conf": 0.5689985752105713,
"adv/mean_abs_reasoning": 0.5295567512512207,
"adv/mean_abs_step_conf": 0.7315965294837952,
"adv/ratio_final_to_reasoning": 1.0744808254566836,
"adv/ratio_step_to_reasoning": 1.3815262061246523,
"adv/std_final_conf": 0.7971473336219788,
"adv/std_reasoning": 0.7928540110588074,
"adv/std_step_conf": 0.9335496425628662,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.712508809020437,
"calib/avg_num_step_conf": 6.125,
"calib/ece": 0.2544621513944223,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6892430278884463,
"calib/gap": 0.3601057082452431,
"calib/mean_conf": 0.7120717131474104,
"calib/mu_c": 0.8354545454545454,
"calib/mu_w": 0.4753488372093024,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.15458167330677292,
"calib/std_conf": 0.4352474348213658,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4282789651293588,
"calib/step_q_c_n": 889.0,
"calib/step_q_gap": 0.17523036424570637,
"calib/step_q_w": 0.25304860088365244,
"calib/step_q_w_n": 679.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2918.0,
"completions/max_terminated_length": 2918.0,
"completions/mean_length": 566.99609375,
"completions/mean_terminated_length": 569.2196655273438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 149.0,
"epoch": 0.1792,
"grad_norm": 0.031764764338731766,
"kl": 0.06189727783203125,
"learning_rate": 8.88888888888889e-07,
"loss": 0.0167,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03189847618341446,
"mask/share_reasoning": 0.8458576798439026,
"mask/share_step_conf": 0.11833761632442474,
"num_tokens": 40380989.0,
"reward": 0.9734334945678711,
"reward_std": 0.20216943323612213,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/asymmetric_l2_reward": 0.8927955031394958,
"rewards/final_brier_reward_step": 0.7298526763916016,
"rewards/format_reward_step": 0.9765625,
"step": 168
},
{
"adv/mean_abs_final_conf": 0.5428816080093384,
"adv/mean_abs_reasoning": 0.45098912715911865,
"adv/mean_abs_step_conf": 0.7654911279678345,
"adv/ratio_final_to_reasoning": 1.2037576414071687,
"adv/ratio_step_to_reasoning": 1.6973604946749699,
"adv/std_final_conf": 0.7792088985443115,
"adv/std_reasoning": 0.7014132142066956,
"adv/std_step_conf": 0.9325076341629028,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.709286971830986,
"calib/avg_num_step_conf": 5.203125,
"calib/ece": 0.29669291338582693,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.6929133858267716,
"calib/gap": 0.3395787223340041,
"calib/mean_conf": 0.718503937007874,
"calib/mu_c": 0.8682394366197184,
"calib/mu_w": 0.5286607142857143,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2280708661417324,
"calib/std_conf": 0.4331410250972485,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4691484049930652,
"calib/step_q_c_n": 721.0,
"calib/step_q_gap": 0.12826460793905542,
"calib/step_q_w": 0.3408837970540098,
"calib/step_q_w_n": 611.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1644.0,
"completions/max_terminated_length": 1644.0,
"completions/mean_length": 477.62890625,
"completions/mean_terminated_length": 479.5019836425781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.18026666666666666,
"grad_norm": 0.036118652671575546,
"kl": 0.0745849609375,
"learning_rate": 8.611111111111112e-07,
"loss": 0.0798,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03544265776872635,
"mask/share_reasoning": 0.8431687355041504,
"mask/share_step_conf": 0.11748235672712326,
"num_tokens": 40607446.0,
"reward": 0.9495621919631958,
"reward_std": 0.17833659052848816,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/asymmetric_l2_reward": 0.8874062895774841,
"rewards/final_brier_reward_step": 0.7023429870605469,
"rewards/format_reward_step": 0.9921875,
"step": 169
},
{
"adv/mean_abs_final_conf": 0.5604931116104126,
"adv/mean_abs_reasoning": 0.47254762053489685,
"adv/mean_abs_step_conf": 0.7200212478637695,
"adv/ratio_final_to_reasoning": 1.1861092665665451,
"adv/ratio_step_to_reasoning": 1.523700927853042,
"adv/std_final_conf": 0.7950620651245117,
"adv/std_reasoning": 0.7573960423469543,
"adv/std_step_conf": 0.9339142441749573,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7524666666666667,
"calib/avg_num_step_conf": 6.33984375,
"calib/ece": 0.23223999999999984,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.732,
"calib/gap": 0.4186000000000002,
"calib/mean_conf": 0.75776,
"calib/mu_c": 0.9252000000000002,
"calib/mu_w": 0.5066,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.19499999999999984,
"calib/std_conf": 0.4107035212899933,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4403399122807018,
"calib/step_q_c_n": 912.0,
"calib/step_q_gap": 0.1492147364719817,
"calib/step_q_w": 0.2911251758087201,
"calib/step_q_w_n": 711.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3006.0,
"completions/max_terminated_length": 3006.0,
"completions/mean_length": 536.828125,
"completions/mean_terminated_length": 536.828125,
"completions/min_length": 150.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.18133333333333335,
"grad_norm": 0.04905329644680023,
"kl": 0.06963348388671875,
"learning_rate": 8.333333333333333e-07,
"loss": 0.0278,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03186079114675522,
"mask/share_reasoning": 0.8381197452545166,
"mask/share_step_conf": 0.13001945614814758,
"num_tokens": 40849026.0,
"reward": 0.972845196723938,
"reward_std": 0.2081618309020996,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/asymmetric_l2_reward": 0.8838129043579102,
"rewards/final_brier_reward_step": 0.7493773698806763,
"rewards/format_reward_step": 0.9765625,
"step": 170
},
{
"adv/mean_abs_final_conf": 0.6511637568473816,
"adv/mean_abs_reasoning": 0.5285821557044983,
"adv/mean_abs_step_conf": 0.7396926879882812,
"adv/ratio_final_to_reasoning": 1.2319064308546428,
"adv/ratio_step_to_reasoning": 1.3993901988658948,
"adv/std_final_conf": 0.8735871315002441,
"adv/std_reasoning": 0.79267817735672,
"adv/std_step_conf": 0.9336190223693848,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.708744492025073,
"calib/avg_num_step_conf": 5.5546875,
"calib/ece": 0.2965354330708661,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6496062992125984,
"calib/gap": 0.3646049773474835,
"calib/mean_conf": 0.6776377952755905,
"calib/mu_c": 0.8541984732824428,
"calib/mu_w": 0.4895934959349593,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.22921259842519678,
"calib/std_conf": 0.4501171443201061,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4538608458390178,
"calib/step_q_c_n": 733.0,
"calib/step_q_gap": 0.15528319707268978,
"calib/step_q_w": 0.29857764876632803,
"calib/step_q_w_n": 689.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1665.0,
"completions/max_terminated_length": 1665.0,
"completions/mean_length": 484.640625,
"completions/mean_terminated_length": 486.54119873046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.1824,
"grad_norm": 0.04068433865904808,
"kl": 0.06620025634765625,
"learning_rate": 8.055555555555557e-07,
"loss": -0.0137,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03494654595851898,
"mask/share_reasoning": 0.8401431441307068,
"mask/share_step_conf": 0.12100405246019363,
"num_tokens": 41079990.0,
"reward": 0.9433212876319885,
"reward_std": 0.1982666552066803,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/asymmetric_l2_reward": 0.8885831832885742,
"rewards/final_brier_reward_step": 0.6980593800544739,
"rewards/format_reward_step": 0.98828125,
"step": 171
},
{
"adv/mean_abs_final_conf": 0.5560652017593384,
"adv/mean_abs_reasoning": 0.41541504859924316,
"adv/mean_abs_step_conf": 0.755997896194458,
"adv/ratio_final_to_reasoning": 1.3385774146467728,
"adv/ratio_step_to_reasoning": 1.819861602856328,
"adv/std_final_conf": 0.7791886925697327,
"adv/std_reasoning": 0.6816006898880005,
"adv/std_step_conf": 0.9320200085639954,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6219101553531798,
"calib/avg_num_step_conf": 5.6953125,
"calib/ece": 0.2877777777777778,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.8412698412698413,
"calib/gap": 0.13524450906864627,
"calib/mean_conf": 0.8542857142857143,
"calib/mu_c": 0.893463687150838,
"calib/mu_w": 0.7582191780821917,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2158730158730159,
"calib/std_conf": 0.3380638818457493,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.42098344693281403,
"calib/step_q_c_n": 1027.0,
"calib/step_q_gap": 0.06116906178200193,
"calib/step_q_w": 0.3598143851508121,
"calib/step_q_w_n": 431.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2329.0,
"completions/max_terminated_length": 2329.0,
"completions/mean_length": 474.8515625,
"completions/mean_terminated_length": 476.7137451171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.18346666666666667,
"grad_norm": 0.029070017859339714,
"kl": 0.08162689208984375,
"learning_rate": 7.777777777777779e-07,
"loss": 0.0266,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03529661148786545,
"mask/share_reasoning": 0.8278791904449463,
"mask/share_step_conf": 0.13291800022125244,
"num_tokens": 41304904.0,
"reward": 0.9602549076080322,
"reward_std": 0.18681581318378448,
"rewards/accuracy_reward_step": 0.69921875,
"rewards/asymmetric_l2_reward": 0.8800837993621826,
"rewards/final_brier_reward_step": 0.7037070393562317,
"rewards/format_reward_step": 0.984375,
"step": 172
},
{
"adv/mean_abs_final_conf": 0.5546171069145203,
"adv/mean_abs_reasoning": 0.4558177590370178,
"adv/mean_abs_step_conf": 0.7512186169624329,
"adv/ratio_final_to_reasoning": 1.2167518617226116,
"adv/ratio_step_to_reasoning": 1.6480679000956278,
"adv/std_final_conf": 0.793694019317627,
"adv/std_reasoning": 0.7205135226249695,
"adv/std_step_conf": 0.9342923164367676,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6639427641394277,
"calib/avg_num_step_conf": 5.72265625,
"calib/ece": 0.34315789473684216,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.8987854251012146,
"calib/gap": 0.16451105384511044,
"calib/mean_conf": 0.9097165991902834,
"calib/mu_c": 0.976986301369863,
"calib/mu_w": 0.8124752475247525,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.3308906882591094,
"calib/std_conf": 0.2742769675103127,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.4677382319173364,
"calib/step_q_c_n": 871.0,
"calib/step_q_gap": 0.05031398949309396,
"calib/step_q_w": 0.4174242424242424,
"calib/step_q_w_n": 594.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2478.0,
"completions/max_terminated_length": 2478.0,
"completions/mean_length": 515.125,
"completions/mean_terminated_length": 517.1451416015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 139.0,
"epoch": 0.18453333333333333,
"grad_norm": 0.03989225625991821,
"kl": 0.07109832763671875,
"learning_rate": 7.5e-07,
"loss": 0.0486,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.035245005041360855,
"mask/share_reasoning": 0.8358505964279175,
"mask/share_step_conf": 0.12499810010194778,
"num_tokens": 41539936.0,
"reward": 0.8893845081329346,
"reward_std": 0.1888009011745453,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/asymmetric_l2_reward": 0.8339041471481323,
"rewards/final_brier_reward_step": 0.6378335952758789,
"rewards/format_reward_step": 0.96484375,
"step": 173
},
{
"adv/mean_abs_final_conf": 0.6846530437469482,
"adv/mean_abs_reasoning": 0.6147565841674805,
"adv/mean_abs_step_conf": 0.7412719130516052,
"adv/ratio_final_to_reasoning": 1.1136977811699624,
"adv/ratio_step_to_reasoning": 1.2057974361599642,
"adv/std_final_conf": 0.8761252164840698,
"adv/std_reasoning": 0.8266335725784302,
"adv/std_step_conf": 0.9345950484275818,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6985884485884486,
"calib/avg_num_step_conf": 5.8828125,
"calib/ece": 0.3396385542168675,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.6224899598393574,
"calib/gap": 0.318115773115773,
"calib/mean_conf": 0.6447791164658635,
"calib/mu_c": 0.8134188034188033,
"calib/mu_w": 0.49530303030303036,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.2572690763052209,
"calib/std_conf": 0.46009433663111154,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.40963017751479286,
"calib/step_q_c_n": 676.0,
"calib/step_q_gap": 0.07976270763527482,
"calib/step_q_w": 0.32986746987951804,
"calib/step_q_w_n": 830.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2518.0,
"completions/max_terminated_length": 2518.0,
"completions/mean_length": 582.76171875,
"completions/mean_terminated_length": 582.76171875,
"completions/min_length": 218.0,
"completions/min_terminated_length": 218.0,
"epoch": 0.1856,
"grad_norm": 0.03356742113828659,
"kl": 0.06365203857421875,
"learning_rate": 7.222222222222222e-07,
"loss": -0.0127,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03057742491364479,
"mask/share_reasoning": 0.8542848825454712,
"mask/share_step_conf": 0.11513769626617432,
"num_tokens": 41793355.0,
"reward": 0.8853936195373535,
"reward_std": 0.238206684589386,
"rewards/accuracy_reward_step": 0.45703125,
"rewards/asymmetric_l2_reward": 0.8359798192977905,
"rewards/final_brier_reward_step": 0.6488698720932007,
"rewards/format_reward_step": 0.97265625,
"step": 174
},
{
"adv/mean_abs_final_conf": 0.6600933074951172,
"adv/mean_abs_reasoning": 0.49963003396987915,
"adv/mean_abs_step_conf": 0.7545915842056274,
"adv/ratio_final_to_reasoning": 1.3211641867288382,
"adv/ratio_step_to_reasoning": 1.5103006883111414,
"adv/std_final_conf": 0.8539295792579651,
"adv/std_reasoning": 0.7394165992736816,
"adv/std_step_conf": 0.9340283274650574,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6916584564860426,
"calib/avg_num_step_conf": 6.4453125,
"calib/ece": 0.35928,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.656,
"calib/gap": 0.317983579638752,
"calib/mean_conf": 0.67776,
"calib/mu_c": 0.8621904761904762,
"calib/mu_w": 0.5442068965517242,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.30852,
"calib/std_conf": 0.45016061844634964,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4147289156626506,
"calib/step_q_c_n": 664.0,
"calib/step_q_gap": 0.12084149172755931,
"calib/step_q_w": 0.29388742393509126,
"calib/step_q_w_n": 986.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2887.0,
"completions/max_terminated_length": 2887.0,
"completions/mean_length": 553.91015625,
"completions/mean_terminated_length": 558.2716674804688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.18666666666666668,
"grad_norm": 0.06960975378751755,
"kl": 0.06383132934570312,
"learning_rate": 6.944444444444446e-07,
"loss": -0.0632,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03212447091937065,
"mask/share_reasoning": 0.834095299243927,
"mask/share_step_conf": 0.12596774101257324,
"num_tokens": 42040980.0,
"reward": 0.8779298067092896,
"reward_std": 0.2234022468328476,
"rewards/accuracy_reward_step": 0.4140625,
"rewards/asymmetric_l2_reward": 0.8505501747131348,
"rewards/final_brier_reward_step": 0.6271843910217285,
"rewards/format_reward_step": 0.9765625,
"step": 175
},
{
"adv/mean_abs_final_conf": 0.5671244859695435,
"adv/mean_abs_reasoning": 0.45925626158714294,
"adv/mean_abs_step_conf": 0.7286878228187561,
"adv/ratio_final_to_reasoning": 1.2348758926217334,
"adv/ratio_step_to_reasoning": 1.5866693255318611,
"adv/std_final_conf": 0.7785314321517944,
"adv/std_reasoning": 0.7206246852874756,
"adv/std_step_conf": 0.9327684640884399,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7994992295839753,
"calib/avg_num_step_conf": 6.03125,
"calib/ece": 0.23900000000000005,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.68,
"calib/gap": 0.47578839239856185,
"calib/mean_conf": 0.70164,
"calib/mu_c": 0.9262121212121212,
"calib/mu_w": 0.4504237288135593,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.20632000000000003,
"calib/std_conf": 0.4420272733667008,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4255120101137801,
"calib/step_q_c_n": 791.0,
"calib/step_q_gap": 0.15050537000753839,
"calib/step_q_w": 0.2750066401062417,
"calib/step_q_w_n": 753.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2814.0,
"completions/max_terminated_length": 2814.0,
"completions/mean_length": 526.4375,
"completions/mean_terminated_length": 526.4375,
"completions/min_length": 124.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.18773333333333334,
"grad_norm": 0.07707049697637558,
"kl": 0.0714874267578125,
"learning_rate": 6.666666666666667e-07,
"loss": -0.0188,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.036508336663246155,
"mask/share_reasoning": 0.8322881460189819,
"mask/share_step_conf": 0.13120350241661072,
"num_tokens": 42279812.0,
"reward": 0.9646437764167786,
"reward_std": 0.1994141936302185,
"rewards/accuracy_reward_step": 0.515625,
"rewards/asymmetric_l2_reward": 0.886325478553772,
"rewards/final_brier_reward_step": 0.7445245981216431,
"rewards/format_reward_step": 0.9765625,
"step": 176
},
{
"adv/mean_abs_final_conf": 0.6146119832992554,
"adv/mean_abs_reasoning": 0.497945100069046,
"adv/mean_abs_step_conf": 0.7429401278495789,
"adv/ratio_final_to_reasoning": 1.234296678919086,
"adv/ratio_step_to_reasoning": 1.4920121269323894,
"adv/std_final_conf": 0.8337008953094482,
"adv/std_reasoning": 0.7752436995506287,
"adv/std_step_conf": 0.9334618449211121,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7455472379969024,
"calib/avg_num_step_conf": 6.1015625,
"calib/ece": 0.26134387351778665,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6324110671936759,
"calib/gap": 0.40504710893133716,
"calib/mean_conf": 0.6538339920948617,
"calib/mu_c": 0.8203355704697987,
"calib/mu_w": 0.4152884615384615,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.16312252964426888,
"calib/std_conf": 0.45910542430164514,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4397441860465117,
"calib/step_q_c_n": 860.0,
"calib/step_q_gap": 0.13854048234280797,
"calib/step_q_w": 0.30120370370370375,
"calib/step_q_w_n": 702.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1611.0,
"completions/max_terminated_length": 1611.0,
"completions/mean_length": 505.9140625,
"completions/mean_terminated_length": 507.8980712890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 196.0,
"epoch": 0.1888,
"grad_norm": 0.03708864748477936,
"kl": 0.08526611328125,
"learning_rate": 6.388888888888889e-07,
"loss": -0.016,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.033189140260219574,
"mask/share_reasoning": 0.8366128206253052,
"mask/share_step_conf": 0.12629178166389465,
"num_tokens": 42513158.0,
"reward": 0.9594067335128784,
"reward_std": 0.21088215708732605,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/asymmetric_l2_reward": 0.8830938935279846,
"rewards/final_brier_reward_step": 0.7232195138931274,
"rewards/format_reward_step": 0.98046875,
"step": 177
},
{
"adv/mean_abs_final_conf": 0.6202833652496338,
"adv/mean_abs_reasoning": 0.4997982382774353,
"adv/mean_abs_step_conf": 0.7502584457397461,
"adv/ratio_final_to_reasoning": 1.2410675303447505,
"adv/ratio_step_to_reasoning": 1.501122629654572,
"adv/std_final_conf": 0.8151530623435974,
"adv/std_reasoning": 0.7575740814208984,
"adv/std_step_conf": 0.9331055879592896,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.803671928620453,
"calib/avg_num_step_conf": 5.38671875,
"calib/ece": 0.20469879518072293,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.7228915662650602,
"calib/gap": 0.4577343857240904,
"calib/mean_conf": 0.7482329317269076,
"calib/mu_c": 0.921032258064516,
"calib/mu_w": 0.46329787234042563,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.1652208835341366,
"calib/std_conf": 0.41368664047303105,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.46093316519546024,
"calib/step_q_c_n": 793.0,
"calib/step_q_gap": 0.19946558840365136,
"calib/step_q_w": 0.2614675767918089,
"calib/step_q_w_n": 586.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2674.0,
"completions/max_terminated_length": 2674.0,
"completions/mean_length": 474.35546875,
"completions/mean_terminated_length": 478.0905456542969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.18986666666666666,
"grad_norm": 0.04096180945634842,
"kl": 0.08055877685546875,
"learning_rate": 6.111111111111112e-07,
"loss": 0.0305,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.035738177597522736,
"mask/share_reasoning": 0.8296140432357788,
"mask/share_step_conf": 0.12683530151844025,
"num_tokens": 42740665.0,
"reward": 0.9872586727142334,
"reward_std": 0.21035568416118622,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/asymmetric_l2_reward": 0.8873934745788574,
"rewards/final_brier_reward_step": 0.7714987993240356,
"rewards/format_reward_step": 0.97265625,
"step": 178
},
{
"adv/mean_abs_final_conf": 0.5899964570999146,
"adv/mean_abs_reasoning": 0.5133465528488159,
"adv/mean_abs_step_conf": 0.7466897964477539,
"adv/ratio_final_to_reasoning": 1.1493141501111288,
"adv/ratio_step_to_reasoning": 1.454553054469734,
"adv/std_final_conf": 0.7958014011383057,
"adv/std_reasoning": 0.7394075989723206,
"adv/std_step_conf": 0.9330047965049744,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7870469798657718,
"calib/avg_num_step_conf": 5.6953125,
"calib/ece": 0.2223293172690764,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6947791164658634,
"calib/gap": 0.44986040268456373,
"calib/mean_conf": 0.7183935742971888,
"calib/mu_c": 0.8990604026845638,
"calib/mu_w": 0.44920000000000004,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.17116465863453825,
"calib/std_conf": 0.42960127880849125,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.42761737089201884,
"calib/step_q_c_n": 852.0,
"calib/step_q_gap": 0.14840945009993967,
"calib/step_q_w": 0.2792079207920792,
"calib/step_q_w_n": 606.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2685.0,
"completions/max_terminated_length": 2685.0,
"completions/mean_length": 505.24609375,
"completions/mean_terminated_length": 509.2243957519531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 199.0,
"epoch": 0.19093333333333334,
"grad_norm": 0.0683615505695343,
"kl": 0.069488525390625,
"learning_rate": 5.833333333333334e-07,
"loss": -0.0626,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.034047625958919525,
"mask/share_reasoning": 0.83702552318573,
"mask/share_step_conf": 0.1211143285036087,
"num_tokens": 42976272.0,
"reward": 0.9702746272087097,
"reward_std": 0.20280179381370544,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/asymmetric_l2_reward": 0.878600001335144,
"rewards/final_brier_reward_step": 0.7517929673194885,
"rewards/format_reward_step": 0.96875,
"step": 179
},
{
"adv/mean_abs_final_conf": 0.5583059787750244,
"adv/mean_abs_reasoning": 0.3551320731639862,
"adv/mean_abs_step_conf": 0.7670993804931641,
"adv/ratio_final_to_reasoning": 1.5721080154796956,
"adv/ratio_step_to_reasoning": 2.160039710462725,
"adv/std_final_conf": 0.7800789475440979,
"adv/std_reasoning": 0.6403230428695679,
"adv/std_step_conf": 0.9319993853569031,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.752177759629781,
"calib/avg_num_step_conf": 6.31640625,
"calib/ece": 0.2102788844621513,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.7051792828685259,
"calib/gap": 0.463054988430652,
"calib/mean_conf": 0.7283665338645419,
"calib/mu_c": 0.8999367088607595,
"calib/mu_w": 0.4368817204301075,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1545816733067728,
"calib/std_conf": 0.42847747819257054,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4166522210184182,
"calib/step_q_c_n": 923.0,
"calib/step_q_gap": 0.1318827685688505,
"calib/step_q_w": 0.2847694524495677,
"calib/step_q_w_n": 694.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2492.0,
"completions/max_terminated_length": 2492.0,
"completions/mean_length": 572.82421875,
"completions/mean_terminated_length": 577.3346557617188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.192,
"grad_norm": 0.04807959124445915,
"kl": 0.065093994140625,
"learning_rate": 5.555555555555555e-07,
"loss": 0.001,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.0307810977101326,
"mask/share_reasoning": 0.8431800603866577,
"mask/share_step_conf": 0.1182263046503067,
"num_tokens": 43226771.0,
"reward": 0.9837304353713989,
"reward_std": 0.17399966716766357,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/asymmetric_l2_reward": 0.8786393404006958,
"rewards/final_brier_reward_step": 0.7700715065002441,
"rewards/format_reward_step": 0.9765625,
"step": 180
},
{
"adv/mean_abs_final_conf": 0.5820561647415161,
"adv/mean_abs_reasoning": 0.41340193152427673,
"adv/mean_abs_step_conf": 0.7540974617004395,
"adv/ratio_final_to_reasoning": 1.4079667276719903,
"adv/ratio_step_to_reasoning": 1.824126604633814,
"adv/std_final_conf": 0.8107714653015137,
"adv/std_reasoning": 0.6816644072532654,
"adv/std_step_conf": 0.933193027973175,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7967366557045283,
"calib/avg_num_step_conf": 5.7421875,
"calib/ece": 0.27083333333333326,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5952380952380952,
"calib/gap": 0.4364672400708322,
"calib/mean_conf": 0.6251984126984127,
"calib/mu_c": 0.8572881355932203,
"calib/mu_w": 0.42082089552238805,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2138888888888888,
"calib/std_conf": 0.46455683814846205,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42562974203338394,
"calib/step_q_c_n": 659.0,
"calib/step_q_gap": 0.15130175189774892,
"calib/step_q_w": 0.274327990135635,
"calib/step_q_w_n": 811.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2933.0,
"completions/max_terminated_length": 2933.0,
"completions/mean_length": 470.2734375,
"completions/mean_terminated_length": 473.97637939453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.19306666666666666,
"grad_norm": 0.045435406267642975,
"kl": 0.08133697509765625,
"learning_rate": 5.277777777777779e-07,
"loss": -0.0131,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03530710190534592,
"mask/share_reasoning": 0.8284176588058472,
"mask/share_step_conf": 0.12846270203590393,
"num_tokens": 43453425.0,
"reward": 0.9462225437164307,
"reward_std": 0.16522014141082764,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/asymmetric_l2_reward": 0.8868392705917358,
"rewards/final_brier_reward_step": 0.7165433168411255,
"rewards/format_reward_step": 0.984375,
"step": 181
},
{
"adv/mean_abs_final_conf": 0.5667697191238403,
"adv/mean_abs_reasoning": 0.4072721302509308,
"adv/mean_abs_step_conf": 0.7169884443283081,
"adv/ratio_final_to_reasoning": 1.3916241181900637,
"adv/ratio_step_to_reasoning": 1.7604652787966444,
"adv/std_final_conf": 0.7674015164375305,
"adv/std_reasoning": 0.6816434860229492,
"adv/std_step_conf": 0.9322676658630371,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7537513208876364,
"calib/avg_num_step_conf": 6.359375,
"calib/ece": 0.2442857142857143,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.7420634920634921,
"calib/gap": 0.35981683691440625,
"calib/mean_conf": 0.764920634920635,
"calib/mu_c": 0.8862874251497005,
"calib/mu_w": 0.5264705882352942,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.17325396825396824,
"calib/std_conf": 0.4069012711885218,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4113034623217922,
"calib/step_q_c_n": 982.0,
"calib/step_q_gap": 0.12497219297194695,
"calib/step_q_w": 0.28633126934984526,
"calib/step_q_w_n": 646.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2670.0,
"completions/max_terminated_length": 2670.0,
"completions/mean_length": 526.54296875,
"completions/mean_terminated_length": 530.68896484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 215.0,
"epoch": 0.19413333333333332,
"grad_norm": 0.031100839376449585,
"kl": 0.06735992431640625,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0536,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.031918980181217194,
"mask/share_reasoning": 0.8300646543502808,
"mask/share_step_conf": 0.13020385801792145,
"num_tokens": 43694380.0,
"reward": 0.9876556396484375,
"reward_std": 0.1890118420124054,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/asymmetric_l2_reward": 0.8993324041366577,
"rewards/final_brier_reward_step": 0.7494163513183594,
"rewards/format_reward_step": 0.98046875,
"step": 182
},
{
"adv/mean_abs_final_conf": 0.6406201720237732,
"adv/mean_abs_reasoning": 0.4643709659576416,
"adv/mean_abs_step_conf": 0.7473157644271851,
"adv/ratio_final_to_reasoning": 1.37954398312277,
"adv/ratio_step_to_reasoning": 1.6093076854752215,
"adv/std_final_conf": 0.844761073589325,
"adv/std_reasoning": 0.7206538319587708,
"adv/std_step_conf": 0.9331881999969482,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.707851110416015,
"calib/avg_num_step_conf": 5.57421875,
"calib/ece": 0.3112598425196852,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.6181102362204725,
"calib/gap": 0.3128683140444166,
"calib/mean_conf": 0.6425196850393701,
"calib/mu_c": 0.7841726618705036,
"calib/mu_w": 0.471304347826087,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.20326771653543318,
"calib/std_conf": 0.46144068898940727,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.40705179282868525,
"calib/step_q_c_n": 753.0,
"calib/step_q_gap": 0.12211114000969414,
"calib/step_q_w": 0.2849406528189911,
"calib/step_q_w_n": 674.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2040.0,
"completions/max_terminated_length": 2040.0,
"completions/mean_length": 522.02734375,
"completions/mean_terminated_length": 522.02734375,
"completions/min_length": 202.0,
"completions/min_terminated_length": 202.0,
"epoch": 0.1952,
"grad_norm": 0.035230036824941635,
"kl": 0.067352294921875,
"learning_rate": 4.7222222222222226e-07,
"loss": -0.1027,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03272725269198418,
"mask/share_reasoning": 0.8501238226890564,
"mask/share_step_conf": 0.11714892089366913,
"num_tokens": 43934699.0,
"reward": 0.9388773441314697,
"reward_std": 0.19883155822753906,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/asymmetric_l2_reward": 0.8908123970031738,
"rewards/final_brier_reward_step": 0.6799108982086182,
"rewards/format_reward_step": 0.9921875,
"step": 183
},
{
"adv/mean_abs_final_conf": 0.599378764629364,
"adv/mean_abs_reasoning": 0.5096433162689209,
"adv/mean_abs_step_conf": 0.7571796178817749,
"adv/ratio_final_to_reasoning": 1.1760750028419735,
"adv/ratio_step_to_reasoning": 1.4857049895700738,
"adv/std_final_conf": 0.8223716020584106,
"adv/std_reasoning": 0.757595956325531,
"adv/std_step_conf": 0.9336126446723938,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7833777481678881,
"calib/avg_num_step_conf": 6.16796875,
"calib/ece": 0.18841897233201577,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.691699604743083,
"calib/gap": 0.5367495003331113,
"calib/mean_conf": 0.7044664031620554,
"calib/mu_c": 0.9060126582278482,
"calib/mu_w": 0.36926315789473685,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.13418972332015805,
"calib/std_conf": 0.44528026864656384,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4251701427003294,
"calib/step_q_c_n": 911.0,
"calib/step_q_gap": 0.13512523252068864,
"calib/step_q_w": 0.29004491017964074,
"calib/step_q_w_n": 668.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2798.0,
"completions/max_terminated_length": 2798.0,
"completions/mean_length": 522.33203125,
"completions/mean_terminated_length": 522.33203125,
"completions/min_length": 157.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.19626666666666667,
"grad_norm": 0.03629198670387268,
"kl": 0.06891632080078125,
"learning_rate": 4.444444444444445e-07,
"loss": 0.0815,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.032990530133247375,
"mask/share_reasoning": 0.843195378780365,
"mask/share_step_conf": 0.12381406873464584,
"num_tokens": 44173696.0,
"reward": 1.0000405311584473,
"reward_std": 0.20342886447906494,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/asymmetric_l2_reward": 0.8806300759315491,
"rewards/final_brier_reward_step": 0.7991386651992798,
"rewards/format_reward_step": 0.984375,
"step": 184
},
{
"adv/mean_abs_final_conf": 0.5196036696434021,
"adv/mean_abs_reasoning": 0.3922334909439087,
"adv/mean_abs_step_conf": 0.7511488199234009,
"adv/ratio_final_to_reasoning": 1.3247305027242255,
"adv/ratio_step_to_reasoning": 1.9150552853499674,
"adv/std_final_conf": 0.7578298449516296,
"adv/std_reasoning": 0.6816517114639282,
"adv/std_step_conf": 0.9322458505630493,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.790268456375839,
"calib/avg_num_step_conf": 6.34765625,
"calib/ece": 0.18509960159362543,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6573705179282868,
"calib/gap": 0.5555230951440977,
"calib/mean_conf": 0.6780079681274901,
"calib/mu_c": 0.9037583892617448,
"calib/mu_w": 0.3482352941176471,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1347410358565736,
"calib/std_conf": 0.45274111572155207,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4383154121863799,
"calib/step_q_c_n": 837.0,
"calib/step_q_gap": 0.1932646507650601,
"calib/step_q_w": 0.24505076142131982,
"calib/step_q_w_n": 788.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2399.0,
"completions/max_terminated_length": 2399.0,
"completions/mean_length": 518.94921875,
"completions/mean_terminated_length": 520.984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.19733333333333333,
"grad_norm": 0.04340367391705513,
"kl": 0.06716156005859375,
"learning_rate": 4.1666666666666667e-07,
"loss": -0.041,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03309101611375809,
"mask/share_reasoning": 0.8382681012153625,
"mask/share_step_conf": 0.12473461031913757,
"num_tokens": 44413467.0,
"reward": 0.9960746765136719,
"reward_std": 0.16638922691345215,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/asymmetric_l2_reward": 0.8855493068695068,
"rewards/final_brier_reward_step": 0.7948812246322632,
"rewards/format_reward_step": 0.9765625,
"step": 185
},
{
"adv/mean_abs_final_conf": 0.5315274000167847,
"adv/mean_abs_reasoning": 0.4281071424484253,
"adv/mean_abs_step_conf": 0.7454137206077576,
"adv/ratio_final_to_reasoning": 1.2415756415015162,
"adv/ratio_step_to_reasoning": 1.7411849667926498,
"adv/std_final_conf": 0.7777411937713623,
"adv/std_reasoning": 0.7204946279525757,
"adv/std_step_conf": 0.9312522411346436,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.765589455372675,
"calib/avg_num_step_conf": 6.1953125,
"calib/ece": 0.22940711462450591,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6245059288537549,
"calib/gap": 0.47976916900843036,
"calib/mean_conf": 0.6343873517786561,
"calib/mu_c": 0.8126415094339623,
"calib/mu_w": 0.33287234042553193,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11766798418972331,
"calib/std_conf": 0.46896248296948856,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.41842163355408385,
"calib/step_q_c_n": 906.0,
"calib/step_q_gap": 0.15374516296584861,
"calib/step_q_w": 0.26467647058823524,
"calib/step_q_w_n": 680.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2520.0,
"completions/max_terminated_length": 2520.0,
"completions/mean_length": 516.51171875,
"completions/mean_terminated_length": 520.5787353515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 189.0,
"epoch": 0.1984,
"grad_norm": 0.031362757086753845,
"kl": 0.07161712646484375,
"learning_rate": 3.8888888888888895e-07,
"loss": 0.0144,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03271438926458359,
"mask/share_reasoning": 0.8355770707130432,
"mask/share_step_conf": 0.12389606237411499,
"num_tokens": 44650734.0,
"reward": 0.9942600131034851,
"reward_std": 0.15620407462120056,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/asymmetric_l2_reward": 0.9050840139389038,
"rewards/final_brier_reward_step": 0.7615609169006348,
"rewards/format_reward_step": 0.98828125,
"step": 186
},
{
"adv/mean_abs_final_conf": 0.7074639201164246,
"adv/mean_abs_reasoning": 0.5609918832778931,
"adv/mean_abs_step_conf": 0.7377896308898926,
"adv/ratio_final_to_reasoning": 1.2610947523566487,
"adv/ratio_step_to_reasoning": 1.3151520599174533,
"adv/std_final_conf": 0.878374457359314,
"adv/std_reasoning": 0.7928860783576965,
"adv/std_step_conf": 0.9338283538818359,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.6919585826665771,
"calib/avg_num_step_conf": 6.58203125,
"calib/ece": 0.326178861788618,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.6260162601626016,
"calib/gap": 0.28398171182680026,
"calib/mean_conf": 0.655040650406504,
"calib/mu_c": 0.7785611510791367,
"calib/mu_w": 0.49457943925233644,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.20808943089430904,
"calib/std_conf": 0.4556517247662998,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.3989940119760479,
"calib/step_q_c_n": 835.0,
"calib/step_q_gap": 0.13965283550545965,
"calib/step_q_w": 0.25934117647058824,
"calib/step_q_w_n": 850.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2988.0,
"completions/max_terminated_length": 2988.0,
"completions/mean_length": 573.05078125,
"completions/mean_terminated_length": 573.05078125,
"completions/min_length": 167.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.19946666666666665,
"grad_norm": 0.047245342284440994,
"kl": 0.06989288330078125,
"learning_rate": 3.611111111111111e-07,
"loss": 0.0154,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03235046565532684,
"mask/share_reasoning": 0.8461363315582275,
"mask/share_step_conf": 0.1215132549405098,
"num_tokens": 44898979.0,
"reward": 0.8949373960494995,
"reward_std": 0.23922453820705414,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/asymmetric_l2_reward": 0.8468563556671143,
"rewards/final_brier_reward_step": 0.6437996029853821,
"rewards/format_reward_step": 0.953125,
"step": 187
},
{
"adv/mean_abs_final_conf": 0.5643256902694702,
"adv/mean_abs_reasoning": 0.47549188137054443,
"adv/mean_abs_step_conf": 0.7558174729347229,
"adv/ratio_final_to_reasoning": 1.186825080257677,
"adv/ratio_step_to_reasoning": 1.5895486390980553,
"adv/std_final_conf": 0.7799937129020691,
"adv/std_reasoning": 0.7574599981307983,
"adv/std_step_conf": 0.9330994486808777,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6968869290509412,
"calib/avg_num_step_conf": 5.8203125,
"calib/ece": 0.2914859437751004,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.6987951807228916,
"calib/gap": 0.32033960773989734,
"calib/mean_conf": 0.7150200803212851,
"calib/mu_c": 0.8526760563380282,
"calib/mu_w": 0.5323364485981309,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.21811244979919678,
"calib/std_conf": 0.4348300914519625,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.43332065906210393,
"calib/step_q_c_n": 789.0,
"calib/step_q_gap": 0.10563164337023517,
"calib/step_q_w": 0.32768901569186876,
"calib/step_q_w_n": 701.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2533.0,
"completions/max_terminated_length": 2533.0,
"completions/mean_length": 534.078125,
"completions/mean_terminated_length": 536.172607421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 119.0,
"epoch": 0.20053333333333334,
"grad_norm": 0.038735099136829376,
"kl": 0.07011795043945312,
"learning_rate": 3.3333333333333335e-07,
"loss": 0.0616,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.033644575625658035,
"mask/share_reasoning": 0.8366734981536865,
"mask/share_step_conf": 0.12577570974826813,
"num_tokens": 45139775.0,
"reward": 0.9236536026000977,
"reward_std": 0.19174334406852722,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/asymmetric_l2_reward": 0.8591135144233704,
"rewards/final_brier_reward_step": 0.6827249526977539,
"rewards/format_reward_step": 0.97265625,
"step": 188
},
{
"adv/mean_abs_final_conf": 0.6418702602386475,
"adv/mean_abs_reasoning": 0.47118186950683594,
"adv/mean_abs_step_conf": 0.7607347965240479,
"adv/ratio_final_to_reasoning": 1.3622558544335823,
"adv/ratio_step_to_reasoning": 1.6145247636974518,
"adv/std_final_conf": 0.8555505275726318,
"adv/std_reasoning": 0.7392024993896484,
"adv/std_step_conf": 0.9324488639831543,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7302891744933266,
"calib/avg_num_step_conf": 5.65234375,
"calib/ece": 0.2803529411764706,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5647058823529412,
"calib/gap": 0.35672268907563026,
"calib/mean_conf": 0.6170588235294119,
"calib/mu_c": 0.7835294117647059,
"calib/mu_w": 0.42680672268907566,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.18203921568627449,
"calib/std_conf": 0.46179655191796504,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.38990264255910995,
"calib/step_q_c_n": 719.0,
"calib/step_q_gap": 0.1266883568448242,
"calib/step_q_w": 0.26321428571428573,
"calib/step_q_w_n": 728.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1315.0,
"completions/max_terminated_length": 1315.0,
"completions/mean_length": 488.4453125,
"completions/mean_terminated_length": 490.3608093261719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.2016,
"grad_norm": 0.04739164561033249,
"kl": 0.07581329345703125,
"learning_rate": 3.055555555555556e-07,
"loss": -0.0497,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.0347851887345314,
"mask/share_reasoning": 0.8378597497940063,
"mask/share_step_conf": 0.12344881892204285,
"num_tokens": 45372585.0,
"reward": 0.9458571672439575,
"reward_std": 0.20587018132209778,
"rewards/accuracy_reward_step": 0.53125,
"rewards/asymmetric_l2_reward": 0.8852865099906921,
"rewards/final_brier_reward_step": 0.7017402648925781,
"rewards/format_reward_step": 0.9921875,
"step": 189
},
{
"adv/mean_abs_final_conf": 0.6198446750640869,
"adv/mean_abs_reasoning": 0.4780905246734619,
"adv/mean_abs_step_conf": 0.7499421834945679,
"adv/ratio_final_to_reasoning": 1.2965006480466095,
"adv/ratio_step_to_reasoning": 1.568619633293887,
"adv/std_final_conf": 0.8346047401428223,
"adv/std_reasoning": 0.7392293810844421,
"adv/std_step_conf": 0.9328544735908508,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7546182266009852,
"calib/avg_num_step_conf": 5.98046875,
"calib/ece": 0.25703124999999993,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.6328125,
"calib/gap": 0.4335714285714287,
"calib/mean_conf": 0.6471093749999999,
"calib/mu_c": 0.8435714285714287,
"calib/mu_w": 0.41000000000000003,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17863281249999996,
"calib/std_conf": 0.4635092912090429,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.39644417475728155,
"calib/step_q_c_n": 824.0,
"calib/step_q_gap": 0.11338901209815849,
"calib/step_q_w": 0.28305516265912306,
"calib/step_q_w_n": 707.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1462.0,
"completions/max_terminated_length": 1462.0,
"completions/mean_length": 529.01953125,
"completions/mean_terminated_length": 531.0941772460938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.20266666666666666,
"grad_norm": 0.028230194002389908,
"kl": 0.06874847412109375,
"learning_rate": 2.7777777777777776e-07,
"loss": -0.0593,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.0311330147087574,
"mask/share_reasoning": 0.8457903265953064,
"mask/share_step_conf": 0.1191704124212265,
"num_tokens": 45613622.0,
"reward": 0.9775964021682739,
"reward_std": 0.17072449624538422,
"rewards/accuracy_reward_step": 0.546875,
"rewards/asymmetric_l2_reward": 0.9036279320716858,
"rewards/final_brier_reward_step": 0.742189884185791,
"rewards/format_reward_step": 1.0,
"step": 190
},
{
"adv/mean_abs_final_conf": 0.614532470703125,
"adv/mean_abs_reasoning": 0.43729501962661743,
"adv/mean_abs_step_conf": 0.7394310235977173,
"adv/ratio_final_to_reasoning": 1.405304069613784,
"adv/ratio_step_to_reasoning": 1.6909202950197728,
"adv/std_final_conf": 0.8193501830101013,
"adv/std_reasoning": 0.7205649018287659,
"adv/std_step_conf": 0.9335800409317017,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6994230528032108,
"calib/avg_num_step_conf": 6.80078125,
"calib/ece": 0.3281818181818182,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6284584980237155,
"calib/gap": 0.30981500062711653,
"calib/mean_conf": 0.6575889328063241,
"calib/mu_c": 0.8216806722689076,
"calib/mu_w": 0.5118656716417911,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.25770750988142294,
"calib/std_conf": 0.45239821753549814,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42917261055634803,
"calib/step_q_c_n": 701.0,
"calib/step_q_gap": 0.12468222594096345,
"calib/step_q_w": 0.3044903846153846,
"calib/step_q_w_n": 1040.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2682.0,
"completions/max_terminated_length": 2682.0,
"completions/mean_length": 520.515625,
"completions/mean_terminated_length": 520.515625,
"completions/min_length": 140.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.20373333333333332,
"grad_norm": 0.04467106983065605,
"kl": 0.07276153564453125,
"learning_rate": 2.5000000000000004e-07,
"loss": 0.0413,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03575979173183441,
"mask/share_reasoning": 0.82267165184021,
"mask/share_step_conf": 0.14156854152679443,
"num_tokens": 45851042.0,
"reward": 0.9081317186355591,
"reward_std": 0.185234934091568,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/asymmetric_l2_reward": 0.8679160475730896,
"rewards/final_brier_reward_step": 0.6577222943305969,
"rewards/format_reward_step": 0.98828125,
"step": 191
},
{
"adv/mean_abs_final_conf": 0.6072635650634766,
"adv/mean_abs_reasoning": 0.47368013858795166,
"adv/mean_abs_step_conf": 0.7371071577072144,
"adv/ratio_final_to_reasoning": 1.2820118801555398,
"adv/ratio_step_to_reasoning": 1.5561284876848394,
"adv/std_final_conf": 0.7935887575149536,
"adv/std_reasoning": 0.7393070459365845,
"adv/std_step_conf": 0.9304554462432861,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.797986798679868,
"calib/avg_num_step_conf": 5.640625,
"calib/ece": 0.19027888446215144,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.5976095617529881,
"calib/gap": 0.5394666666666668,
"calib/mean_conf": 0.6223904382470119,
"calib/mu_c": 0.8394666666666667,
"calib/mu_w": 0.3,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.10752988047808769,
"calib/std_conf": 0.4627484397319022,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.40863046044864226,
"calib/step_q_c_n": 847.0,
"calib/step_q_gap": 0.12956915391597895,
"calib/step_q_w": 0.2790613065326633,
"calib/step_q_w_n": 597.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3016.0,
"completions/max_terminated_length": 3016.0,
"completions/mean_length": 535.84765625,
"completions/mean_terminated_length": 537.9490356445312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.2048,
"grad_norm": 0.047735992819070816,
"kl": 0.07106781005859375,
"learning_rate": 2.2222222222222224e-07,
"loss": 0.0159,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03571078181266785,
"mask/share_reasoning": 0.8375634551048279,
"mask/share_step_conf": 0.12281954288482666,
"num_tokens": 46093195.0,
"reward": 0.9984359741210938,
"reward_std": 0.19008705019950867,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/asymmetric_l2_reward": 0.8958485126495361,
"rewards/final_brier_reward_step": 0.7885234355926514,
"rewards/format_reward_step": 0.9765625,
"step": 192
},
{
"adv/mean_abs_final_conf": 0.663608729839325,
"adv/mean_abs_reasoning": 0.5208760499954224,
"adv/mean_abs_step_conf": 0.7138371467590332,
"adv/ratio_final_to_reasoning": 1.274024270928097,
"adv/ratio_step_to_reasoning": 1.3704549225584601,
"adv/std_final_conf": 0.8608205318450928,
"adv/std_reasoning": 0.7394456267356873,
"adv/std_step_conf": 0.9336157441139221,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7956674862117085,
"calib/avg_num_step_conf": 5.92578125,
"calib/ece": 0.22696000000000016,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.592,
"calib/gap": 0.4576955279420558,
"calib/mean_conf": 0.6274400000000001,
"calib/mu_c": 0.8123489932885906,
"calib/mu_w": 0.35465346534653475,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.12920000000000015,
"calib/std_conf": 0.45731547797991706,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.4241108247422681,
"calib/step_q_c_n": 776.0,
"calib/step_q_gap": 0.1588341715708781,
"calib/step_q_w": 0.26527665317139,
"calib/step_q_w_n": 741.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3030.0,
"completions/max_terminated_length": 3030.0,
"completions/mean_length": 552.3515625,
"completions/mean_terminated_length": 552.3515625,
"completions/min_length": 215.0,
"completions/min_terminated_length": 215.0,
"epoch": 0.20586666666666667,
"grad_norm": 0.04746817424893379,
"kl": 0.068115234375,
"learning_rate": 1.9444444444444447e-07,
"loss": -0.004,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.031759485602378845,
"mask/share_reasoning": 0.846390426158905,
"mask/share_step_conf": 0.12185005843639374,
"num_tokens": 46340309.0,
"reward": 0.9673618674278259,
"reward_std": 0.2169458270072937,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/asymmetric_l2_reward": 0.8808885812759399,
"rewards/final_brier_reward_step": 0.7436789274215698,
"rewards/format_reward_step": 0.96875,
"step": 193
},
{
"adv/mean_abs_final_conf": 0.5674813985824585,
"adv/mean_abs_reasoning": 0.3744353652000427,
"adv/mean_abs_step_conf": 0.7580137252807617,
"adv/ratio_final_to_reasoning": 1.5155657059243872,
"adv/ratio_step_to_reasoning": 2.0244180858178065,
"adv/std_final_conf": 0.8004591464996338,
"adv/std_reasoning": 0.6403605937957764,
"adv/std_step_conf": 0.933774471282959,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.8198921359588611,
"calib/avg_num_step_conf": 5.44921875,
"calib/ece": 0.21711462450592878,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6205533596837944,
"calib/gap": 0.5331562774363476,
"calib/mean_conf": 0.6388537549407114,
"calib/mu_c": 0.8896268656716418,
"calib/mu_w": 0.35647058823529415,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.1631620553359683,
"calib/std_conf": 0.4687978385135658,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.44545193687230994,
"calib/step_q_c_n": 697.0,
"calib/step_q_gap": 0.17214248128491738,
"calib/step_q_w": 0.27330945558739256,
"calib/step_q_w_n": 698.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2128.0,
"completions/max_terminated_length": 2128.0,
"completions/mean_length": 478.26171875,
"completions/mean_terminated_length": 478.26171875,
"completions/min_length": 146.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.20693333333333333,
"grad_norm": 0.03423633426427841,
"kl": 0.06972503662109375,
"learning_rate": 1.6666666666666668e-07,
"loss": -0.0187,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03547438606619835,
"mask/share_reasoning": 0.8433297276496887,
"mask/share_step_conf": 0.12119589745998383,
"num_tokens": 46568688.0,
"reward": 0.9822894334793091,
"reward_std": 0.18843895196914673,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/asymmetric_l2_reward": 0.8912980556488037,
"rewards/final_brier_reward_step": 0.7717183232307434,
"rewards/format_reward_step": 0.984375,
"step": 194
},
{
"adv/mean_abs_final_conf": 0.5949089527130127,
"adv/mean_abs_reasoning": 0.3306322693824768,
"adv/mean_abs_step_conf": 0.7501680850982666,
"adv/ratio_final_to_reasoning": 1.799306987863364,
"adv/ratio_step_to_reasoning": 2.268889502223599,
"adv/std_final_conf": 0.8255235552787781,
"adv/std_reasoning": 0.6185620427131653,
"adv/std_step_conf": 0.9332430362701416,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7805431330611187,
"calib/avg_num_step_conf": 5.890625,
"calib/ece": 0.20566800000000002,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.592,
"calib/gap": 0.5285356795644565,
"calib/mean_conf": 0.623028,
"calib/mu_c": 0.8576978417266186,
"calib/mu_w": 0.32916216216216215,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.13634800000000002,
"calib/std_conf": 0.4616225159326611,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4317737789203085,
"calib/step_q_c_n": 778.0,
"calib/step_q_gap": 0.16059569672852764,
"calib/step_q_w": 0.27117808219178086,
"calib/step_q_w_n": 730.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2909.0,
"completions/max_terminated_length": 2909.0,
"completions/mean_length": 512.59765625,
"completions/mean_terminated_length": 516.6338500976562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.208,
"grad_norm": 0.04133572801947594,
"kl": 0.06982421875,
"learning_rate": 1.3888888888888888e-07,
"loss": -0.0151,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03313131630420685,
"mask/share_reasoning": 0.8336950540542603,
"mask/share_step_conf": 0.1253610998392105,
"num_tokens": 46805897.0,
"reward": 0.985278844833374,
"reward_std": 0.18139265477657318,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/asymmetric_l2_reward": 0.888818621635437,
"rewards/final_brier_reward_step": 0.7778328061103821,
"rewards/format_reward_step": 0.9765625,
"step": 195
},
{
"adv/mean_abs_final_conf": 0.5143544673919678,
"adv/mean_abs_reasoning": 0.3996366560459137,
"adv/mean_abs_step_conf": 0.7586710453033447,
"adv/ratio_final_to_reasoning": 1.2870552778643867,
"adv/ratio_step_to_reasoning": 1.8984020455225261,
"adv/std_final_conf": 0.7629029750823975,
"adv/std_reasoning": 0.6816370487213135,
"adv/std_step_conf": 0.9331806898117065,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7817218627077782,
"calib/avg_num_step_conf": 5.32421875,
"calib/ece": 0.2220553359683795,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6956521739130435,
"calib/gap": 0.4824356046187033,
"calib/mean_conf": 0.7189723320158102,
"calib/mu_c": 0.9306338028169014,
"calib/mu_w": 0.44819819819819817,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18988142292490126,
"calib/std_conf": 0.43153981494492955,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4720873124147339,
"calib/step_q_c_n": 733.0,
"calib/step_q_gap": 0.14921429654171803,
"calib/step_q_w": 0.32287301587301587,
"calib/step_q_w_n": 630.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2488.0,
"completions/max_terminated_length": 2488.0,
"completions/mean_length": 425.453125,
"completions/mean_terminated_length": 428.80316162109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.20906666666666668,
"grad_norm": 0.03610050305724144,
"kl": 0.08242034912109375,
"learning_rate": 1.1111111111111112e-07,
"loss": -0.0574,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.037843845784664154,
"mask/share_reasoning": 0.8252567052841187,
"mask/share_step_conf": 0.12908688187599182,
"num_tokens": 47017357.0,
"reward": 0.9776846170425415,
"reward_std": 0.17640987038612366,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/asymmetric_l2_reward": 0.8756676912307739,
"rewards/final_brier_reward_step": 0.7711077928543091,
"rewards/format_reward_step": 0.98828125,
"step": 196
},
{
"adv/mean_abs_final_conf": 0.5709211826324463,
"adv/mean_abs_reasoning": 0.5281293988227844,
"adv/mean_abs_step_conf": 0.7545597553253174,
"adv/ratio_final_to_reasoning": 1.081025187965385,
"adv/ratio_step_to_reasoning": 1.4287402992661509,
"adv/std_final_conf": 0.7653646469116211,
"adv/std_reasoning": 0.7754925489425659,
"adv/std_step_conf": 0.932712972164154,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.8252395752395751,
"calib/avg_num_step_conf": 6.26953125,
"calib/ece": 0.2161441767068273,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6144578313253012,
"calib/gap": 0.5121394522144523,
"calib/mean_conf": 0.6553417670682731,
"calib/mu_c": 0.8959856060606062,
"calib/mu_w": 0.38384615384615384,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1706827309236948,
"calib/std_conf": 0.4506214670081801,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.43755376344086017,
"calib/step_q_c_n": 744.0,
"calib/step_q_gap": 0.16742600502041882,
"calib/step_q_w": 0.27012775842044134,
"calib/step_q_w_n": 861.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2589.0,
"completions/max_terminated_length": 2589.0,
"completions/mean_length": 550.2890625,
"completions/mean_terminated_length": 550.2890625,
"completions/min_length": 182.0,
"completions/min_terminated_length": 182.0,
"epoch": 0.21013333333333334,
"grad_norm": 0.04556097462773323,
"kl": 0.06296539306640625,
"learning_rate": 8.333333333333334e-08,
"loss": 0.0593,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03290029242634773,
"mask/share_reasoning": 0.8371220231056213,
"mask/share_step_conf": 0.12997770309448242,
"num_tokens": 47263287.0,
"reward": 0.9729477167129517,
"reward_std": 0.1934598684310913,
"rewards/accuracy_reward_step": 0.515625,
"rewards/asymmetric_l2_reward": 0.8871469497680664,
"rewards/final_brier_reward_step": 0.7618734240531921,
"rewards/format_reward_step": 0.96875,
"step": 197
},
{
"adv/mean_abs_final_conf": 0.6123339533805847,
"adv/mean_abs_reasoning": 0.5126949548721313,
"adv/mean_abs_step_conf": 0.7387211918830872,
"adv/ratio_final_to_reasoning": 1.1943436297969887,
"adv/ratio_step_to_reasoning": 1.440859101231702,
"adv/std_final_conf": 0.8177485466003418,
"adv/std_reasoning": 0.7576181292533875,
"adv/std_step_conf": 0.931955099105835,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.8303197064989518,
"calib/avg_num_step_conf": 5.9921875,
"calib/ece": 0.16856000000000004,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.524,
"calib/gap": 0.6000628930817611,
"calib/mean_conf": 0.5522400000000001,
"calib/mu_c": 0.8066666666666668,
"calib/mu_w": 0.20660377358490564,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07240000000000002,
"calib/std_conf": 0.47586529858774107,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.41482986111111114,
"calib/step_q_c_n": 864.0,
"calib/step_q_gap": 0.10320299543946931,
"calib/step_q_w": 0.31162686567164183,
"calib/step_q_w_n": 670.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2589.0,
"completions/max_terminated_length": 2589.0,
"completions/mean_length": 475.43359375,
"completions/mean_terminated_length": 481.0711669921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.2112,
"grad_norm": 0.03035557083785534,
"kl": 0.07807159423828125,
"learning_rate": 5.555555555555556e-08,
"loss": 0.0208,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03742978721857071,
"mask/share_reasoning": 0.8121263384819031,
"mask/share_step_conf": 0.13872510194778442,
"num_tokens": 47490382.0,
"reward": 1.0016413927078247,
"reward_std": 0.15997019410133362,
"rewards/accuracy_reward_step": 0.5625,
"rewards/asymmetric_l2_reward": 0.8928694128990173,
"rewards/final_brier_reward_step": 0.8026007413864136,
"rewards/format_reward_step": 0.9765625,
"step": 198
},
{
"adv/mean_abs_final_conf": 0.6816428899765015,
"adv/mean_abs_reasoning": 0.537695050239563,
"adv/mean_abs_step_conf": 0.7122054100036621,
"adv/ratio_final_to_reasoning": 1.267712785663183,
"adv/ratio_step_to_reasoning": 1.3245526617482313,
"adv/std_final_conf": 0.8624335527420044,
"adv/std_reasoning": 0.7929316163063049,
"adv/std_step_conf": 0.9336603283882141,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7438086548488009,
"calib/avg_num_step_conf": 5.76171875,
"calib/ece": 0.27726907630522085,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.642570281124498,
"calib/gap": 0.3785968456725758,
"calib/mean_conf": 0.6783935742971888,
"calib/mu_c": 0.8486861313868614,
"calib/mu_w": 0.47008928571428565,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.2027309236947791,
"calib/std_conf": 0.44654118043262725,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.425260347129506,
"calib/step_q_c_n": 749.0,
"calib/step_q_gap": 0.12844216531132419,
"calib/step_q_w": 0.2968181818181818,
"calib/step_q_w_n": 726.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2184.0,
"completions/max_terminated_length": 2184.0,
"completions/mean_length": 547.49609375,
"completions/mean_terminated_length": 551.8070678710938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.21226666666666666,
"grad_norm": 0.29570773243904114,
"kl": 1.4525260925292969,
"learning_rate": 2.777777777777778e-08,
"loss": -0.0941,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.0359857976436615,
"mask/share_reasoning": 0.8317693471908569,
"mask/share_step_conf": 0.12443234026432037,
"num_tokens": 47734741.0,
"reward": 0.940308690071106,
"reward_std": 0.24737019836902618,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/asymmetric_l2_reward": 0.877549409866333,
"rewards/final_brier_reward_step": 0.7022866606712341,
"rewards/format_reward_step": 0.96875,
"step": 199
},
{
"adv/mean_abs_final_conf": 0.5579714775085449,
"adv/mean_abs_reasoning": 0.4412845969200134,
"adv/mean_abs_step_conf": 0.7684429883956909,
"adv/ratio_final_to_reasoning": 1.2644254555970418,
"adv/ratio_step_to_reasoning": 1.7413773192155577,
"adv/std_final_conf": 0.7886030077934265,
"adv/std_reasoning": 0.7204948663711548,
"adv/std_step_conf": 0.9336580634117126,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7859723058398554,
"calib/avg_num_step_conf": 5.30859375,
"calib/ece": 0.2114399999999999,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.64,
"calib/gap": 0.52675630476955,
"calib/mean_conf": 0.6500800000000001,
"calib/mu_c": 0.8586754966887419,
"calib/mu_w": 0.3319191919191919,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.12875999999999993,
"calib/std_conf": 0.46648128965693786,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.4678467635402906,
"calib/step_q_c_n": 757.0,
"calib/step_q_gap": 0.20974045124793178,
"calib/step_q_w": 0.2581063122923588,
"calib/step_q_w_n": 602.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1912.0,
"completions/max_terminated_length": 1912.0,
"completions/mean_length": 496.10546875,
"completions/mean_terminated_length": 500.0118103027344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.21333333333333335,
"grad_norm": 0.029257260262966156,
"kl": 0.0724029541015625,
"learning_rate": 0.0,
"loss": -0.0295,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03522798418998718,
"mask/share_reasoning": 0.8422371745109558,
"mask/share_step_conf": 0.1147223487496376,
"num_tokens": 47969792.0,
"reward": 0.9783412218093872,
"reward_std": 0.192615807056427,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/asymmetric_l2_reward": 0.8736051321029663,
"rewards/final_brier_reward_step": 0.7705773711204529,
"rewards/format_reward_step": 0.97265625,
"step": 200
},
{
"epoch": 0.21333333333333335,
"step": 200,
"total_flos": 0.0,
"train_loss": -0.003022296619601548,
"train_runtime": 14229.7566,
"train_samples_per_second": 3.598,
"train_steps_per_second": 0.014
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 47969792,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}