{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.773959219455719, "adv/mean_abs_reasoning": 0.47714588046073914, "adv/mean_abs_step_conf": 0.7489925622940063, "adv/ratio_final_to_reasoning": 1.622059942565935, "adv/ratio_step_to_reasoning": 1.5697349447317201, "adv/std_final_conf": 0.9294352531433105, "adv/std_reasoning": 0.7393431663513184, "adv/std_step_conf": 0.9343287348747253, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.38076182006817844, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.2003187250996017, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2948207171314741, "calib/gap": -0.026059730250481805, "calib/mean_conf": 0.8737051792828686, "calib/mu_c": 0.865606936416185, "calib/mu_w": 0.8916666666666668, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19239043824701207, "calib/std_conf": 0.09027744273295583, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7959393232205367, "calib/step_q_c_n": 857.0, "calib/step_q_gap": -0.006446568895645877, "calib/step_q_w": 0.8023858921161826, "calib/step_q_w_n": 482.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 474.94921875, "completions/mean_terminated_length": 478.68896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.0010666666666666667, "grad_norm": 0.04304001107811928, "kl": 0.000291675329208374, "learning_rate": 2.5000000000000004e-07, "loss": -0.0135, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03466901555657387, "mask/share_reasoning": 0.8340686559677124, "mask/share_step_conf": 0.12344987690448761, "num_tokens": 229171.0, "reward": 0.8933797478675842, "reward_std": 0.19672280550003052, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7142800688743591, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7420106530189514, "step": 1 }, { "adv/mean_abs_final_conf": 0.7672724723815918, "adv/mean_abs_reasoning": 0.5104547739028931, "adv/mean_abs_step_conf": 0.7698422074317932, "adv/ratio_final_to_reasoning": 1.503115479781084, "adv/ratio_step_to_reasoning": 1.5081496868873343, "adv/std_final_conf": 0.9330522418022156, "adv/std_reasoning": 0.7575037479400635, "adv/std_step_conf": 0.9345327615737915, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.44343065693430656, "calib/avg_num_step_conf": 5.05859375, "calib/ece": 0.3349411764705883, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2823529411764706, "calib/gap": 0.002352468143016151, "calib/mean_conf": 0.8721960784313726, "calib/mu_c": 0.8732846715328467, "calib/mu_w": 0.8709322033898306, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3349411764705883, "calib/std_conf": 0.07627016470309335, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7954391371340525, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.011011892552009073, "calib/step_q_w": 0.7844272445820434, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1966.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 492.9765625, "completions/mean_terminated_length": 494.9098205566406, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.0021333333333333334, "grad_norm": 0.04044223949313164, "kl": 0.00037539005279541016, "learning_rate": 5.000000000000001e-07, "loss": -0.0157, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03364308178424835, "mask/share_reasoning": 0.8523939251899719, "mask/share_step_conf": 0.11005672812461853, "num_tokens": 458661.0, "reward": 0.833743691444397, "reward_std": 0.19285300374031067, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6320762038230896, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7291612029075623, "step": 2 }, { "adv/mean_abs_final_conf": 0.7681164145469666, "adv/mean_abs_reasoning": 0.480376660823822, "adv/mean_abs_step_conf": 0.7542245388031006, "adv/ratio_final_to_reasoning": 1.598987788519295, "adv/ratio_step_to_reasoning": 1.5700690735258518, "adv/std_final_conf": 0.9304441809654236, "adv/std_reasoning": 0.7392795085906982, "adv/std_step_conf": 0.9335688948631287, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.49758064516129036, "calib/avg_num_step_conf": 4.91796875, "calib/ece": 0.2540316205533596, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.3557312252964427, "calib/gap": 0.001176075268817356, "calib/mean_conf": 0.8864426877470355, "calib/mu_c": 0.8868750000000001, "calib/mu_w": 0.8856989247311827, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2540316205533596, "calib/std_conf": 0.04630191430886356, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8032205683355886, "calib/step_q_c_n": 739.0, "calib/step_q_gap": 0.05218210679712709, "calib/step_q_w": 0.7510384615384615, "calib/step_q_w_n": 520.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2141.0, "completions/max_terminated_length": 2141.0, "completions/mean_length": 498.859375, "completions/mean_terminated_length": 500.8157043457031, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.0032, "grad_norm": 0.06902016699314117, "kl": 0.0011424124240875244, "learning_rate": 7.5e-07, "loss": 0.0011, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03277520090341568, "mask/share_reasoning": 0.8543053865432739, "mask/share_step_conf": 0.10901317000389099, "num_tokens": 691625.0, "reward": 0.8797547817230225, "reward_std": 0.19141316413879395, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6923027038574219, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7445505857467651, "step": 3 }, { "adv/mean_abs_final_conf": 0.759086549282074, "adv/mean_abs_reasoning": 0.3866489827632904, "adv/mean_abs_step_conf": 0.7315422296524048, "adv/ratio_final_to_reasoning": 1.9632446563212422, "adv/ratio_step_to_reasoning": 1.8920060888929346, "adv/std_final_conf": 0.9280747771263123, "adv/std_reasoning": 0.6815574169158936, "adv/std_step_conf": 0.9344615340232849, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5238012749445676, "calib/avg_num_step_conf": 5.328125, "calib/ece": 0.23059523809523813, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.28174603174603174, "calib/gap": 0.006848669623060077, "calib/mean_conf": 0.8813888888888889, "calib/mu_c": 0.8837804878048782, "calib/mu_w": 0.8769318181818181, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.23059523809523813, "calib/std_conf": 0.04538258392998638, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8031084337349397, "calib/step_q_c_n": 830.0, "calib/step_q_gap": 0.02587996931546399, "calib/step_q_w": 0.7772284644194757, "calib/step_q_w_n": 534.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2736.0, "completions/max_terminated_length": 2736.0, "completions/mean_length": 503.65234375, "completions/mean_terminated_length": 505.6274719238281, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.004266666666666667, "grad_norm": 0.06372291594743729, "kl": 0.00030165910720825195, "learning_rate": 1.0000000000000002e-06, "loss": 0.0234, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03290240466594696, "mask/share_reasoning": 0.8445039987564087, "mask/share_step_conf": 0.11868731677532196, "num_tokens": 926728.0, "reward": 0.8805626630783081, "reward_std": 0.16167013347148895, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7082082033157349, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7286983132362366, "step": 4 }, { "adv/mean_abs_final_conf": 0.7936871647834778, "adv/mean_abs_reasoning": 0.4292134642601013, "adv/mean_abs_step_conf": 0.7634122967720032, "adv/ratio_final_to_reasoning": 1.8491665124058343, "adv/ratio_step_to_reasoning": 1.7786308220503049, "adv/std_final_conf": 0.9305525422096252, "adv/std_reasoning": 0.6816219091415405, "adv/std_step_conf": 0.9333208203315735, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.49735567732891545, "calib/avg_num_step_conf": 4.73046875, "calib/ece": 0.35171314741035864, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.2788844621513944, "calib/gap": -3.12221231045795e-05, "calib/mean_conf": 0.8815936254980079, "calib/mu_c": 0.881578947368421, "calib/mu_w": 0.8816101694915256, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.35171314741035864, "calib/std_conf": 0.04364665025964348, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.797311669128508, "calib/step_q_c_n": 677.0, "calib/step_q_gap": 0.0173303957202684, "calib/step_q_w": 0.7799812734082396, "calib/step_q_w_n": 534.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2453.0, "completions/max_terminated_length": 2453.0, "completions/mean_length": 510.68359375, "completions/mean_terminated_length": 512.686279296875, "completions/min_length": 0.0, "completions/min_terminated_length": 177.0, "epoch": 0.005333333333333333, "grad_norm": 0.04050876572728157, "kl": 0.00031438469886779785, "learning_rate": 1.25e-06, "loss": -0.0514, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03355187550187111, "mask/share_reasoning": 0.8516452312469482, "mask/share_step_conf": 0.11089661717414856, "num_tokens": 1164151.0, "reward": 0.795979380607605, "reward_std": 0.16130538284778595, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6110925674438477, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.6824285984039307, "step": 5 }, { "adv/mean_abs_final_conf": 0.7439464926719666, "adv/mean_abs_reasoning": 0.3911336064338684, "adv/mean_abs_step_conf": 0.7478975057601929, "adv/ratio_final_to_reasoning": 1.9020265209498193, "adv/ratio_step_to_reasoning": 1.9121279620513634, "adv/std_final_conf": 0.9310183525085449, "adv/std_reasoning": 0.7013688683509827, "adv/std_step_conf": 0.9341275095939636, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4631035825545171, "calib/avg_num_step_conf": 5.296875, "calib/ece": 0.3064143426294821, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.2589641434262948, "calib/gap": -0.003848650051921032, "calib/mean_conf": 0.8784462151394421, "calib/mu_c": 0.8768055555555555, "calib/mu_w": 0.8806542056074765, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.30557768924302786, "calib/std_conf": 0.039210029202454075, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7956066945606696, "calib/step_q_c_n": 717.0, "calib/step_q_gap": 0.00964425324611562, "calib/step_q_w": 0.785962441314554, "calib/step_q_w_n": 639.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2596.0, "completions/max_terminated_length": 2596.0, "completions/mean_length": 443.25390625, "completions/mean_terminated_length": 446.74407958984375, "completions/min_length": 0.0, "completions/min_terminated_length": 176.0, "epoch": 0.0064, "grad_norm": 0.04645095393061638, "kl": 0.0004774928092956543, "learning_rate": 1.5e-06, "loss": -0.0451, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.0370139479637146, "mask/share_reasoning": 0.8299905061721802, "mask/share_step_conf": 0.12518304586410522, "num_tokens": 1383576.0, "reward": 0.8166599273681641, "reward_std": 0.18238767981529236, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6414105892181396, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.6856592297554016, "step": 6 }, { "adv/mean_abs_final_conf": 0.7877150774002075, "adv/mean_abs_reasoning": 0.4653007388114929, "adv/mean_abs_step_conf": 0.7472065091133118, "adv/ratio_final_to_reasoning": 1.692916025476878, "adv/ratio_step_to_reasoning": 1.6058571302119236, "adv/std_final_conf": 0.9305210709571838, "adv/std_reasoning": 0.7206236720085144, "adv/std_step_conf": 0.9340626001358032, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5086842105263157, "calib/avg_num_step_conf": 5.67578125, "calib/ece": 0.26050980392156864, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.3215686274509804, "calib/gap": 0.0001052631578947194, "calib/mean_conf": 0.8839607843137255, "calib/mu_c": 0.884, "calib/mu_w": 0.8838947368421053, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.25850980392156864, "calib/std_conf": 0.04581718180338529, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7908867521367521, "calib/step_q_c_n": 936.0, "calib/step_q_gap": -6.1023491874534486e-05, "calib/step_q_w": 0.7909477756286266, "calib/step_q_w_n": 517.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1445.0, "completions/max_terminated_length": 1445.0, "completions/mean_length": 541.5078125, "completions/mean_terminated_length": 543.6314086914062, "completions/min_length": 0.0, "completions/min_terminated_length": 177.0, "epoch": 0.007466666666666667, "grad_norm": 0.05814244598150253, "kl": 0.0003116726875305176, "learning_rate": 1.75e-06, "loss": 0.0009, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.030131680890917778, "mask/share_reasoning": 0.8535774946212769, "mask/share_step_conf": 0.11238458752632141, "num_tokens": 1629626.0, "reward": 0.8802685737609863, "reward_std": 0.18626058101654053, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.695068359375, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7420310974121094, "step": 7 }, { "adv/mean_abs_final_conf": 0.7732793092727661, "adv/mean_abs_reasoning": 0.37526705861091614, "adv/mean_abs_step_conf": 0.7797518372535706, "adv/ratio_final_to_reasoning": 2.0606106811909553, "adv/ratio_step_to_reasoning": 2.0778584726831344, "adv/std_final_conf": 0.9287807941436768, "adv/std_reasoning": 0.6402899026870728, "adv/std_step_conf": 0.9342546463012695, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4633838383838384, "calib/avg_num_step_conf": 4.734375, "calib/ece": 0.3051181102362205, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.24015748031496062, "calib/gap": 0.0007247474747473781, "calib/mean_conf": 0.8720472440944882, "calib/mu_c": 0.8723611111111111, "calib/mu_w": 0.8716363636363638, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3051181102362205, "calib/std_conf": 0.05314278171408132, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.794279210925645, "calib/step_q_c_n": 659.0, "calib/step_q_gap": 0.023139970419315925, "calib/step_q_w": 0.7711392405063291, "calib/step_q_w_n": 553.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1872.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 505.140625, "completions/mean_terminated_length": 507.1216125488281, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.008533333333333334, "grad_norm": 0.03701084107160568, "kl": 0.0010253936052322388, "learning_rate": 2.0000000000000003e-06, "loss": -0.0653, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.033262234181165695, "mask/share_reasoning": 0.8576483726501465, "mask/share_step_conf": 0.10518313944339752, "num_tokens": 1865454.0, "reward": 0.8495633602142334, "reward_std": 0.16637495160102844, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6513617038726807, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7376086115837097, "step": 8 }, { "adv/mean_abs_final_conf": 0.7699242234230042, "adv/mean_abs_reasoning": 0.42989248037338257, "adv/mean_abs_step_conf": 0.7657554745674133, "adv/ratio_final_to_reasoning": 1.7909692738851526, "adv/ratio_step_to_reasoning": 1.781272084364717, "adv/std_final_conf": 0.9297853112220764, "adv/std_reasoning": 0.7014699578285217, "adv/std_step_conf": 0.934877336025238, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.45016307893020213, "calib/avg_num_step_conf": 5.19140625, "calib/ece": 0.30529880478087645, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.3147410358565737, "calib/gap": -0.006487279843444216, "calib/mean_conf": 0.8790836653386455, "calib/mu_c": 0.8763698630136987, "calib/mu_w": 0.8828571428571429, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.30135458167330675, "calib/std_conf": 0.04754068264220872, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.7566895604395605, "calib/step_q_c_n": 728.0, "calib/step_q_gap": 0.022546465597630294, "calib/step_q_w": 0.7341430948419302, "calib/step_q_w_n": 601.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2879.0, "completions/max_terminated_length": 2879.0, "completions/mean_length": 505.69921875, "completions/mean_terminated_length": 507.682373046875, "completions/min_length": 0.0, "completions/min_terminated_length": 182.0, "epoch": 0.0096, "grad_norm": 0.0427432544529438, "kl": 0.00036010146141052246, "learning_rate": 2.25e-06, "loss": 0.1186, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.034473203122615814, "mask/share_reasoning": 0.8522090315818787, "mask/share_step_conf": 0.10941154509782791, "num_tokens": 2102449.0, "reward": 0.8128198385238647, "reward_std": 0.20088228583335876, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6406656503677368, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.6787240505218506, "step": 9 }, { "adv/mean_abs_final_conf": 0.7671608924865723, "adv/mean_abs_reasoning": 0.4218064248561859, "adv/mean_abs_step_conf": 0.7739673256874084, "adv/ratio_final_to_reasoning": 1.8187510840977217, "adv/ratio_step_to_reasoning": 1.8348874746307886, "adv/std_final_conf": 0.9295661449432373, "adv/std_reasoning": 0.7012843489646912, "adv/std_step_conf": 0.9343947172164917, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.49496732026143797, "calib/avg_num_step_conf": 5.0546875, "calib/ece": 0.2822529644268775, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.33201581027667987, "calib/gap": -0.0011640522875817627, "calib/mean_conf": 0.8869960474308299, "calib/mu_c": 0.8865359477124184, "calib/mu_w": 0.8877000000000002, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2822529644268775, "calib/std_conf": 0.040798211425544656, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7859644322845417, "calib/step_q_c_n": 731.0, "calib/step_q_gap": -0.007844430948140357, "calib/step_q_w": 0.7938088632326821, "calib/step_q_w_n": 563.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3024.0, "completions/max_terminated_length": 3024.0, "completions/mean_length": 529.0859375, "completions/mean_terminated_length": 529.0859375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.010666666666666666, "grad_norm": 0.04322522133588791, "kl": 0.00040537118911743164, "learning_rate": 2.5e-06, "loss": 0.0367, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03210898116230965, "mask/share_reasoning": 0.8572521209716797, "mask/share_step_conf": 0.11063890159130096, "num_tokens": 2344695.0, "reward": 0.846677303314209, "reward_std": 0.17992925643920898, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6672956943511963, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7096526622772217, "step": 10 }, { "adv/mean_abs_final_conf": 0.7750564217567444, "adv/mean_abs_reasoning": 0.4120665192604065, "adv/mean_abs_step_conf": 0.7840834259986877, "adv/ratio_final_to_reasoning": 1.8809012271801329, "adv/ratio_step_to_reasoning": 1.902807894720474, "adv/std_final_conf": 0.9275817275047302, "adv/std_reasoning": 0.6816621422767639, "adv/std_step_conf": 0.9336605668067932, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5005824111822947, "calib/avg_num_step_conf": 5.3125, "calib/ece": 0.2981102362204725, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.421259842519685, "calib/gap": -0.007611467029055796, "calib/mean_conf": 0.8851181102362206, "calib/mu_c": 0.8820915032679739, "calib/mu_w": 0.8897029702970297, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2904330708661418, "calib/std_conf": 0.07794472240478827, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7586185819070905, "calib/step_q_c_n": 818.0, "calib/step_q_gap": -0.025366657945308124, "calib/step_q_w": 0.7839852398523987, "calib/step_q_w_n": 542.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2593.0, "completions/max_terminated_length": 2593.0, "completions/mean_length": 510.859375, "completions/mean_terminated_length": 510.859375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.011733333333333333, "grad_norm": 0.048428092151880264, "kl": 0.000606834888458252, "learning_rate": 2.7500000000000004e-06, "loss": 0.0033, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03325919806957245, "mask/share_reasoning": 0.8499425649642944, "mask/share_step_conf": 0.11679823696613312, "num_tokens": 2579955.0, "reward": 0.8530465364456177, "reward_std": 0.16520269215106964, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6653339862823486, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7235714793205261, "step": 11 }, { "adv/mean_abs_final_conf": 0.779029905796051, "adv/mean_abs_reasoning": 0.4382178783416748, "adv/mean_abs_step_conf": 0.7512285113334656, "adv/ratio_final_to_reasoning": 1.777722781973418, "adv/ratio_step_to_reasoning": 1.7142808371404212, "adv/std_final_conf": 0.9251027703285217, "adv/std_reasoning": 0.7014181017875671, "adv/std_step_conf": 0.9336206912994385, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5273951400711965, "calib/avg_num_step_conf": 5.3046875, "calib/ece": 0.18339920948616595, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.4150197628458498, "calib/gap": 0.00214517876489706, "calib/mean_conf": 0.8947826086956522, "calib/mu_c": 0.8953846153846152, "calib/mu_w": 0.8932394366197182, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17940711462450587, "calib/std_conf": 0.045688109327331215, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.776427818756586, "calib/step_q_c_n": 949.0, "calib/step_q_gap": 0.0118067918617204, "calib/step_q_w": 0.7646210268948656, "calib/step_q_w_n": 409.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2545.0, "completions/max_terminated_length": 2545.0, "completions/mean_length": 483.40625, "completions/mean_terminated_length": 483.40625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.0128, "grad_norm": 0.05811633542180061, "kl": 0.0011870861053466797, "learning_rate": 3e-06, "loss": -0.0039, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03670063614845276, "mask/share_reasoning": 0.8373144865036011, "mask/share_step_conf": 0.12598487734794617, "num_tokens": 2807883.0, "reward": 0.9387664794921875, "reward_std": 0.17711131274700165, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.7571523189544678, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7805367708206177, "step": 12 }, { "adv/mean_abs_final_conf": 0.7828412055969238, "adv/mean_abs_reasoning": 0.46385371685028076, "adv/mean_abs_step_conf": 0.7666856646537781, "adv/ratio_final_to_reasoning": 1.687689840910779, "adv/ratio_step_to_reasoning": 1.6528608843749832, "adv/std_final_conf": 0.9281428456306458, "adv/std_reasoning": 0.7205371856689453, "adv/std_step_conf": 0.9340802431106567, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5346534653465347, "calib/avg_num_step_conf": 4.71875, "calib/ece": 0.291171875, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": 0.013747684445863939, "calib/mean_conf": 0.8966406249999999, "calib/mu_c": 0.9020645161290324, "calib/mu_w": 0.8883168316831684, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.291171875, "calib/std_conf": 0.05633711786743599, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7597331460674157, "calib/step_q_c_n": 712.0, "calib/step_q_gap": 0.009954920260964117, "calib/step_q_w": 0.7497782258064516, "calib/step_q_w_n": 496.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1816.0, "completions/max_terminated_length": 1816.0, "completions/mean_length": 467.59375, "completions/mean_terminated_length": 469.427490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 100.0, "epoch": 0.013866666666666666, "grad_norm": 0.045925572514534, "kl": 0.0027227401733398438, "learning_rate": 3.2500000000000002e-06, "loss": 0.0038, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.035495657473802567, "mask/share_reasoning": 0.8489422798156738, "mask/share_step_conf": 0.11165584623813629, "num_tokens": 3032179.0, "reward": 0.8815797567367554, "reward_std": 0.1894691288471222, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6747804880142212, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7688478231430054, "step": 13 }, { "adv/mean_abs_final_conf": 0.7548052072525024, "adv/mean_abs_reasoning": 0.5050555467605591, "adv/mean_abs_step_conf": 0.7736717462539673, "adv/ratio_final_to_reasoning": 1.4944993913913924, "adv/ratio_step_to_reasoning": 1.5318547657110595, "adv/std_final_conf": 0.9282140731811523, "adv/std_reasoning": 0.7575881481170654, "adv/std_step_conf": 0.9345173835754395, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.44230898011116326, "calib/avg_num_step_conf": 5.859375, "calib/ece": 0.3535365853658536, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5853658536585366, "calib/gap": -0.007598607111765965, "calib/mean_conf": 0.9104471544715447, "calib/mu_c": 0.9070802919708029, "calib/mu_w": 0.9146788990825688, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3535365853658536, "calib/std_conf": 0.03879541990653156, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.753515923566879, "calib/step_q_c_n": 785.0, "calib/step_q_gap": 0.020145294196249663, "calib/step_q_w": 0.7333706293706294, "calib/step_q_w_n": 715.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3013.0, "completions/max_terminated_length": 3013.0, "completions/mean_length": 567.05078125, "completions/mean_terminated_length": 569.2745361328125, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.014933333333333333, "grad_norm": 0.039881713688373566, "kl": 0.003141164779663086, "learning_rate": 3.5e-06, "loss": 0.0357, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.031403157860040665, "mask/share_reasoning": 0.8467769622802734, "mask/share_step_conf": 0.1179136335849762, "num_tokens": 3282744.0, "reward": 0.8136132955551147, "reward_std": 0.19758296012878418, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5947633385658264, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7340257167816162, "step": 14 }, { "adv/mean_abs_final_conf": 0.7607070207595825, "adv/mean_abs_reasoning": 0.31956201791763306, "adv/mean_abs_step_conf": 0.7712626457214355, "adv/ratio_final_to_reasoning": 2.3804675715737105, "adv/ratio_step_to_reasoning": 2.4134991096477183, "adv/std_final_conf": 0.9202884435653687, "adv/std_reasoning": 0.5960820317268372, "adv/std_step_conf": 0.9338883757591248, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.49731732243229426, "calib/avg_num_step_conf": 5.02734375, "calib/ece": 0.32792156862745103, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.7098039215686275, "calib/gap": -0.0028570516096064758, "calib/mean_conf": 0.9185882352941177, "calib/mu_c": 0.9174342105263159, "calib/mu_w": 0.9202912621359224, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32521568627450986, "calib/std_conf": 0.051032160744173334, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.719733688415446, "calib/step_q_c_n": 751.0, "calib/step_q_gap": -0.007131983226345007, "calib/step_q_w": 0.7268656716417911, "calib/step_q_w_n": 536.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2654.0, "completions/max_terminated_length": 2654.0, "completions/mean_length": 461.67578125, "completions/mean_terminated_length": 461.67578125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.016, "grad_norm": 0.038044411689043045, "kl": 0.006200313568115234, "learning_rate": 3.7500000000000005e-06, "loss": 0.0106, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.035136036574840546, "mask/share_reasoning": 0.846064567565918, "mask/share_step_conf": 0.1187993660569191, "num_tokens": 3508813.0, "reward": 0.8648393154144287, "reward_std": 0.14193323254585266, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6486945152282715, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7630153894424438, "step": 15 }, { "adv/mean_abs_final_conf": 0.7559776306152344, "adv/mean_abs_reasoning": 0.42459970712661743, "adv/mean_abs_step_conf": 0.7681550979614258, "adv/ratio_final_to_reasoning": 1.7804478381088442, "adv/ratio_step_to_reasoning": 1.8091277150418728, "adv/std_final_conf": 0.9152006506919861, "adv/std_reasoning": 0.6817044019699097, "adv/std_step_conf": 0.9341446161270142, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5220464725643897, "calib/avg_num_step_conf": 6.6015625, "calib/ece": 0.31845528455284555, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.8577235772357723, "calib/gap": 0.013194288913773833, "calib/mean_conf": 0.9321951219512195, "calib/mu_c": 0.937236842105263, "calib/mu_w": 0.9240425531914892, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.31638211382113823, "calib/std_conf": 0.0715625439722616, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6638545816733068, "calib/step_q_c_n": 1004.0, "calib/step_q_gap": 0.06821318225639716, "calib/step_q_w": 0.5956413994169096, "calib/step_q_w_n": 686.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3071.0, "completions/max_terminated_length": 3071.0, "completions/mean_length": 673.078125, "completions/mean_terminated_length": 675.7177124023438, "completions/min_length": 0.0, "completions/min_terminated_length": 198.0, "epoch": 0.017066666666666667, "grad_norm": 0.036168936640024185, "kl": 0.006927013397216797, "learning_rate": 4.000000000000001e-06, "loss": 0.0749, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.025286775082349777, "mask/share_reasoning": 0.8620797395706177, "mask/share_step_conf": 0.10872718691825867, "num_tokens": 3789969.0, "reward": 0.8630998134613037, "reward_std": 0.18709713220596313, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6401921510696411, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7750698328018188, "step": 16 }, { "adv/mean_abs_final_conf": 0.7265196442604065, "adv/mean_abs_reasoning": 0.5270153284072876, "adv/mean_abs_step_conf": 0.7634360790252686, "adv/ratio_final_to_reasoning": 1.3785550535238666, "adv/ratio_step_to_reasoning": 1.4486031769369532, "adv/std_final_conf": 0.922140896320343, "adv/std_reasoning": 0.7927994132041931, "adv/std_step_conf": 0.9348483681678772, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5486486486486486, "calib/avg_num_step_conf": 5.625, "calib/ece": 0.19995983935742973, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8714859437751004, "calib/gap": 0.006579391891891939, "calib/mean_conf": 0.9402008032128514, "calib/mu_c": 0.9418918918918919, "calib/mu_w": 0.9353125, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.19859437751004017, "calib/std_conf": 0.04102550679991887, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.6443323727185398, "calib/step_q_c_n": 1041.0, "calib/step_q_gap": 0.01593638274360243, "calib/step_q_w": 0.6283959899749374, "calib/step_q_w_n": 399.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2665.0, "completions/max_terminated_length": 2665.0, "completions/mean_length": 530.74609375, "completions/mean_terminated_length": 537.03955078125, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.018133333333333335, "grad_norm": 0.03977389633655548, "kl": 0.01251983642578125, "learning_rate": 4.25e-06, "loss": 0.013, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03260646015405655, "mask/share_reasoning": 0.8350520133972168, "mask/share_step_conf": 0.12062276899814606, "num_tokens": 4029368.0, "reward": 0.945256233215332, "reward_std": 0.23300540447235107, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7420820593833923, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8109303712844849, "step": 17 }, { "adv/mean_abs_final_conf": 0.7605211734771729, "adv/mean_abs_reasoning": 0.3960456848144531, "adv/mean_abs_step_conf": 0.7509171962738037, "adv/ratio_final_to_reasoning": 1.9202864786507547, "adv/ratio_step_to_reasoning": 1.8960368085454773, "adv/std_final_conf": 0.9212812781333923, "adv/std_reasoning": 0.6815840005874634, "adv/std_step_conf": 0.9348611831665039, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4494174571846947, "calib/avg_num_step_conf": 5.0546875, "calib/ece": 0.3976984126984128, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9007936507936508, "calib/gap": -0.0002954096899471237, "calib/mean_conf": 0.946031746031746, "calib/mu_c": 0.9458992805755397, "calib/mu_w": 0.9461946902654869, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.39607142857142863, "calib/std_conf": 0.05627426270492683, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6379673590504451, "calib/step_q_c_n": 674.0, "calib/step_q_gap": 0.02185445582463863, "calib/step_q_w": 0.6161129032258065, "calib/step_q_w_n": 620.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2841.0, "completions/max_terminated_length": 2841.0, "completions/mean_length": 526.40234375, "completions/mean_terminated_length": 526.40234375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.0192, "grad_norm": 0.03876996040344238, "kl": 0.013779640197753906, "learning_rate": 4.5e-06, "loss": 0.0361, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.032325707376003265, "mask/share_reasoning": 0.8605347871780396, "mask/share_step_conf": 0.10713949799537659, "num_tokens": 4274847.0, "reward": 0.8270844221115112, "reward_std": 0.1768120676279068, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5842535495758057, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7652277946472168, "step": 18 }, { "adv/mean_abs_final_conf": 0.7454897165298462, "adv/mean_abs_reasoning": 0.4756406545639038, "adv/mean_abs_step_conf": 0.7751795053482056, "adv/ratio_final_to_reasoning": 1.5673380931101366, "adv/ratio_step_to_reasoning": 1.6297587220733625, "adv/std_final_conf": 0.9180338382720947, "adv/std_reasoning": 0.7392441630363464, "adv/std_step_conf": 0.9347119331359863, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5674637862137862, "calib/avg_num_step_conf": 4.85546875, "calib/ece": 0.3841960784313725, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9372549019607843, "calib/gap": 0.030375249750249633, "calib/mean_conf": 0.9449803921568627, "calib/mu_c": 0.9583216783216782, "calib/mu_w": 0.9279464285714286, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3841960784313725, "calib/std_conf": 0.10404325838579108, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6270876671619613, "calib/step_q_c_n": 673.0, "calib/step_q_gap": 0.046859596986522645, "calib/step_q_w": 0.5802280701754386, "calib/step_q_w_n": 570.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1751.0, "completions/max_terminated_length": 1751.0, "completions/mean_length": 479.44921875, "completions/mean_terminated_length": 481.3294372558594, "completions/min_length": 0.0, "completions/min_terminated_length": 213.0, "epoch": 0.020266666666666665, "grad_norm": 0.02486286498606205, "kl": 0.019275665283203125, "learning_rate": 4.75e-06, "loss": 0.0117, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03219831734895706, "mask/share_reasoning": 0.8544199466705322, "mask/share_step_conf": 0.10947546362876892, "num_tokens": 4502346.0, "reward": 0.8657012581825256, "reward_std": 0.19842995703220367, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6078425645828247, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8126223087310791, "step": 19 }, { "adv/mean_abs_final_conf": 0.75970858335495, "adv/mean_abs_reasoning": 0.42294591665267944, "adv/mean_abs_step_conf": 0.7696812152862549, "adv/ratio_final_to_reasoning": 1.7962310391066334, "adv/ratio_step_to_reasoning": 1.8198100158472799, "adv/std_final_conf": 0.9097000956535339, "adv/std_reasoning": 0.681745171546936, "adv/std_step_conf": 0.9345875382423401, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.3604950495049505, "calib/avg_num_step_conf": 5.48828125, "calib/ece": 0.36517928286852586, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9641434262948207, "calib/gap": -0.00992013201320141, "calib/mean_conf": 0.9587250996015938, "calib/mu_c": 0.9547333333333332, "calib/mu_w": 0.9646534653465346, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36314741035856574, "calib/std_conf": 0.03790279199381405, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5661548556430447, "calib/step_q_c_n": 762.0, "calib/step_q_gap": 0.004475228893433547, "calib/step_q_w": 0.5616796267496111, "calib/step_q_w_n": 643.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2742.0, "completions/max_terminated_length": 2742.0, "completions/mean_length": 490.6875, "completions/mean_terminated_length": 492.6117858886719, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.021333333333333333, "grad_norm": 0.030515167862176895, "kl": 0.024442672729492188, "learning_rate": 5e-06, "loss": 0.04, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.034816332161426544, "mask/share_reasoning": 0.833861231803894, "mask/share_step_conf": 0.1274162083864212, "num_tokens": 4732834.0, "reward": 0.8644837141036987, "reward_std": 0.1930766999721527, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6107491850852966, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.804936945438385, "step": 20 }, { "adv/mean_abs_final_conf": 0.7336472272872925, "adv/mean_abs_reasoning": 0.5508327484130859, "adv/mean_abs_step_conf": 0.7745312452316284, "adv/ratio_final_to_reasoning": 1.3318874547689536, "adv/ratio_step_to_reasoning": 1.4061096539067504, "adv/std_final_conf": 0.9031286835670471, "adv/std_reasoning": 0.7927908301353455, "adv/std_step_conf": 0.9346875548362732, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.539980449657869, "calib/avg_num_step_conf": 5.640625, "calib/ece": 0.3595669291338581, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.984251968503937, "calib/gap": 0.002987943955685801, "calib/mean_conf": 0.9698031496062991, "calib/mu_c": 0.9709677419354839, "calib/mu_w": 0.967979797979798, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3595669291338581, "calib/std_conf": 0.022438907215127788, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5120794392523365, "calib/step_q_c_n": 856.0, "calib/step_q_gap": 0.017419575306758195, "calib/step_q_w": 0.49465986394557826, "calib/step_q_w_n": 588.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2372.0, "completions/max_terminated_length": 2372.0, "completions/mean_length": 503.39453125, "completions/mean_terminated_length": 505.36865234375, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.0224, "grad_norm": 0.025241529569029808, "kl": 0.02947998046875, "learning_rate": 4.9722222222222224e-06, "loss": -0.0444, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03347921371459961, "mask/share_reasoning": 0.8384062647819519, "mask/share_step_conf": 0.12420830875635147, "num_tokens": 4964663.0, "reward": 0.8812829852104187, "reward_std": 0.22070921957492828, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6249253749847412, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8188904523849487, "step": 21 }, { "adv/mean_abs_final_conf": 0.7191415429115295, "adv/mean_abs_reasoning": 0.3771659731864929, "adv/mean_abs_step_conf": 0.7461881637573242, "adv/ratio_final_to_reasoning": 1.9066978307609523, "adv/ratio_step_to_reasoning": 1.978407960434875, "adv/std_final_conf": 0.8773278594017029, "adv/std_reasoning": 0.661241352558136, "adv/std_step_conf": 0.934755265712738, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5144485144485145, "calib/avg_num_step_conf": 5.9296875, "calib/ece": 0.3338735177865613, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9802371541501976, "calib/gap": -0.00047822547822551353, "calib/mean_conf": 0.9733201581027668, "calib/mu_c": 0.9731481481481482, "calib/mu_w": 0.9736263736263737, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33343873517786565, "calib/std_conf": 0.018545006380022704, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.49158168574401667, "calib/step_q_c_n": 961.0, "calib/step_q_gap": 0.01014542003486052, "calib/step_q_w": 0.48143626570915615, "calib/step_q_w_n": 557.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2713.0, "completions/max_terminated_length": 2713.0, "completions/mean_length": 489.57421875, "completions/mean_terminated_length": 491.494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 199.0, "epoch": 0.023466666666666667, "grad_norm": 0.029216783121228218, "kl": 0.0421905517578125, "learning_rate": 4.944444444444445e-06, "loss": -0.0155, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03287845849990845, "mask/share_reasoning": 0.8300130367279053, "mask/share_step_conf": 0.1332021951675415, "num_tokens": 5191810.0, "reward": 0.894919753074646, "reward_std": 0.1613466739654541, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6505191326141357, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8151016235351562, "step": 22 }, { "adv/mean_abs_final_conf": 0.747708797454834, "adv/mean_abs_reasoning": 0.5173189640045166, "adv/mean_abs_step_conf": 0.7510793209075928, "adv/ratio_final_to_reasoning": 1.445353542941654, "adv/ratio_step_to_reasoning": 1.4518689109975007, "adv/std_final_conf": 0.8959153890609741, "adv/std_reasoning": 0.7575620412826538, "adv/std_step_conf": 0.9347652792930603, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4732358550540369, "calib/avg_num_step_conf": 5.79296875, "calib/ece": 0.4126482213438735, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9960474308300395, "calib/gap": -0.002314685314685483, "calib/mean_conf": 0.9766007905138341, "calib/mu_c": 0.9755944055944054, "calib/mu_w": 0.9779090909090908, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41201581027667983, "calib/std_conf": 0.01956813169495786, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.48082932692307695, "calib/step_q_c_n": 832.0, "calib/step_q_gap": -0.006620749881838528, "calib/step_q_w": 0.4874500768049155, "calib/step_q_w_n": 651.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1693.0, "completions/max_terminated_length": 1693.0, "completions/mean_length": 517.4921875, "completions/mean_terminated_length": 517.4921875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.024533333333333334, "grad_norm": 0.03661978244781494, "kl": 0.046306610107421875, "learning_rate": 4.9166666666666665e-06, "loss": 0.0112, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.034062646329402924, "mask/share_reasoning": 0.8369054794311523, "mask/share_step_conf": 0.12903186678886414, "num_tokens": 5428224.0, "reward": 0.8458771109580994, "reward_std": 0.20304051041603088, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.5766586065292358, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8049393892288208, "step": 23 }, { "adv/mean_abs_final_conf": 0.7730128169059753, "adv/mean_abs_reasoning": 0.614250123500824, "adv/mean_abs_step_conf": 0.7427619695663452, "adv/ratio_final_to_reasoning": 1.25846587136248, "adv/ratio_step_to_reasoning": 1.2092174525469979, "adv/std_final_conf": 0.9102962613105774, "adv/std_reasoning": 0.8267250657081604, "adv/std_step_conf": 0.9349828958511353, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5206280991735537, "calib/avg_num_step_conf": 6.4921875, "calib/ece": 0.486869918699187, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0016667768595042265, "calib/mean_conf": 0.978739837398374, "calib/mu_c": 0.9795867768595042, "calib/mu_w": 0.97792, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.486869918699187, "calib/std_conf": 0.01430068359181033, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4792125, "calib/step_q_c_n": 800.0, "calib/step_q_gap": 0.048354031322505764, "calib/step_q_w": 0.4308584686774942, "calib/step_q_w_n": 862.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2509.0, "completions/max_terminated_length": 2509.0, "completions/mean_length": 578.37890625, "completions/mean_terminated_length": 585.2371826171875, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.0256, "grad_norm": 0.025873012840747833, "kl": 0.036952972412109375, "learning_rate": 4.888888888888889e-06, "loss": -0.0523, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03086896426975727, "mask/share_reasoning": 0.8307771682739258, "mask/share_step_conf": 0.1266351342201233, "num_tokens": 5680801.0, "reward": 0.7857183218002319, "reward_std": 0.24644377827644348, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.4935879111289978, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7911299467086792, "step": 24 }, { "adv/mean_abs_final_conf": 0.6677876710891724, "adv/mean_abs_reasoning": 0.44983309507369995, "adv/mean_abs_step_conf": 0.7818006277084351, "adv/ratio_final_to_reasoning": 1.4845232118365215, "adv/ratio_step_to_reasoning": 1.7379793444951979, "adv/std_final_conf": 0.854354739189148, "adv/std_reasoning": 0.7204803228378296, "adv/std_step_conf": 0.9348347187042236, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5566984126984127, "calib/avg_num_step_conf": 6.09765625, "calib/ece": 0.3904705882352941, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0023333333333330764, "calib/mean_conf": 0.9787058823529412, "calib/mu_c": 0.9796666666666666, "calib/mu_w": 0.9773333333333335, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3904705882352941, "calib/std_conf": 0.014907945134827336, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4798648648648648, "calib/step_q_c_n": 888.0, "calib/step_q_gap": 0.034456246737078755, "calib/step_q_w": 0.44540861812778604, "calib/step_q_w_n": 673.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2071.0, "completions/max_terminated_length": 2071.0, "completions/mean_length": 486.875, "completions/mean_terminated_length": 486.875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.02666666666666667, "grad_norm": 0.03276247903704643, "kl": 0.05126953125, "learning_rate": 4.861111111111111e-06, "loss": 0.0223, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03289720416069031, "mask/share_reasoning": 0.8330222964286804, "mask/share_step_conf": 0.13408046960830688, "num_tokens": 5908665.0, "reward": 0.8634007573127747, "reward_std": 0.17636817693710327, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.603858232498169, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8065370321273804, "step": 25 }, { "adv/mean_abs_final_conf": 0.7102105617523193, "adv/mean_abs_reasoning": 0.4211689233779907, "adv/mean_abs_step_conf": 0.7700457572937012, "adv/ratio_final_to_reasoning": 1.6862843441915574, "adv/ratio_step_to_reasoning": 1.8283536950388917, "adv/std_final_conf": 0.867534339427948, "adv/std_reasoning": 0.6815800666809082, "adv/std_step_conf": 0.9347423911094666, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5649297201468912, "calib/avg_num_step_conf": 5.55859375, "calib/ece": 0.39337254901960783, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.996078431372549, "calib/gap": 0.00427440800303891, "calib/mean_conf": 0.9776862745098039, "calib/mu_c": 0.9794630872483221, "calib/mu_w": 0.9751886792452832, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39337254901960783, "calib/std_conf": 0.015254045038687006, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.507744966442953, "calib/step_q_c_n": 745.0, "calib/step_q_gap": 0.035901308625843786, "calib/step_q_w": 0.4718436578171092, "calib/step_q_w_n": 678.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2112.0, "completions/max_terminated_length": 2112.0, "completions/mean_length": 490.828125, "completions/mean_terminated_length": 492.7529602050781, "completions/min_length": 0.0, "completions/min_terminated_length": 196.0, "epoch": 0.027733333333333332, "grad_norm": 0.06875115633010864, "kl": 0.29926300048828125, "learning_rate": 4.833333333333333e-06, "loss": 0.0166, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.030738497152924538, "mask/share_reasoning": 0.8481480479240417, "mask/share_step_conf": 0.11720723658800125, "num_tokens": 6139557.0, "reward": 0.8700515031814575, "reward_std": 0.16770586371421814, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6018503904342651, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8226275444030762, "step": 26 }, { "adv/mean_abs_final_conf": 0.7465525269508362, "adv/mean_abs_reasoning": 0.4980428218841553, "adv/mean_abs_step_conf": 0.7682787179946899, "adv/ratio_final_to_reasoning": 1.4989725665085165, "adv/ratio_step_to_reasoning": 1.5425957051006178, "adv/std_final_conf": 0.8857267498970032, "adv/std_reasoning": 0.7393344640731812, "adv/std_step_conf": 0.9349102973937988, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5512772351615327, "calib/avg_num_step_conf": 6.40234375, "calib/ece": 0.45450592885375496, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9960474308300395, "calib/gap": 0.0043663911845729375, "calib/mean_conf": 0.9762450592885376, "calib/mu_c": 0.9783333333333335, "calib/mu_w": 0.9739669421487606, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.45450592885375496, "calib/std_conf": 0.016978573607796924, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4405236270753512, "calib/step_q_c_n": 783.0, "calib/step_q_gap": 0.0022993280099306568, "calib/step_q_w": 0.43822429906542054, "calib/step_q_w_n": 856.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2318.0, "completions/max_terminated_length": 2318.0, "completions/mean_length": 491.70703125, "completions/mean_terminated_length": 495.5787353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 192.0, "epoch": 0.0288, "grad_norm": 0.13274186849594116, "kl": 0.061382293701171875, "learning_rate": 4.805555555555556e-06, "loss": -0.0484, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0327107235789299, "mask/share_reasoning": 0.8278404474258423, "mask/share_step_conf": 0.13163632154464722, "num_tokens": 6370650.0, "reward": 0.8293382525444031, "reward_std": 0.19650721549987793, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5393917560577393, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8185034990310669, "step": 27 }, { "adv/mean_abs_final_conf": 0.6987853050231934, "adv/mean_abs_reasoning": 0.4233437180519104, "adv/mean_abs_step_conf": 0.7611550688743591, "adv/ratio_final_to_reasoning": 1.6506334574628276, "adv/ratio_step_to_reasoning": 1.7979599942499356, "adv/std_final_conf": 0.8700273633003235, "adv/std_reasoning": 0.7013515830039978, "adv/std_step_conf": 0.934949517250061, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5388085399449037, "calib/avg_num_step_conf": 5.5546875, "calib/ece": 0.3256521739130434, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9881422924901185, "calib/gap": -0.0005681818181820786, "calib/mean_conf": 0.9747430830039526, "calib/mu_c": 0.9745454545454543, "calib/mu_w": 0.9751136363636363, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.32411067193675885, "calib/std_conf": 0.02992139242886999, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4748793859649123, "calib/step_q_c_n": 912.0, "calib/step_q_gap": 0.03242840557275545, "calib/step_q_w": 0.44245098039215686, "calib/step_q_w_n": 510.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2281.0, "completions/max_terminated_length": 2281.0, "completions/mean_length": 541.046875, "completions/mean_terminated_length": 541.046875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.029866666666666666, "grad_norm": 0.026259498670697212, "kl": 0.040096282958984375, "learning_rate": 4.777777777777778e-06, "loss": -0.0154, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03135649859905243, "mask/share_reasoning": 0.8519505858421326, "mask/share_step_conf": 0.11669294536113739, "num_tokens": 6616102.0, "reward": 0.9012293219566345, "reward_std": 0.18706245720386505, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.656219482421875, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8204576969146729, "step": 28 }, { "adv/mean_abs_final_conf": 0.7530903816223145, "adv/mean_abs_reasoning": 0.5109613537788391, "adv/mean_abs_step_conf": 0.7395081520080566, "adv/ratio_final_to_reasoning": 1.473869552076294, "adv/ratio_step_to_reasoning": 1.4472878360349344, "adv/std_final_conf": 0.9067643284797668, "adv/std_reasoning": 0.7575812339782715, "adv/std_step_conf": 0.9346757531166077, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5175736961451247, "calib/avg_num_step_conf": 6.69921875, "calib/ece": 0.47662698412698423, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.996031746031746, "calib/gap": -0.001825396825396619, "calib/mean_conf": 0.9726587301587301, "calib/mu_c": 0.9717460317460319, "calib/mu_w": 0.9735714285714285, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.47464285714285726, "calib/std_conf": 0.03489887038204902, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.45238341968911916, "calib/step_q_c_n": 772.0, "calib/step_q_gap": 0.05528904004967056, "calib/step_q_w": 0.3970943796394486, "calib/step_q_w_n": 943.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2779.0, "completions/max_terminated_length": 2779.0, "completions/mean_length": 581.3125, "completions/mean_terminated_length": 581.3125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.030933333333333334, "grad_norm": 0.02828669734299183, "kl": 0.03983306884765625, "learning_rate": 4.75e-06, "loss": -0.0191, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.02909451350569725, "mask/share_reasoning": 0.8438331484794617, "mask/share_step_conf": 0.12707233428955078, "num_tokens": 6872046.0, "reward": 0.8204588890075684, "reward_std": 0.19906474649906158, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.5123624801635742, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8340240716934204, "step": 29 }, { "adv/mean_abs_final_conf": 0.7167383432388306, "adv/mean_abs_reasoning": 0.5228696465492249, "adv/mean_abs_step_conf": 0.738788366317749, "adv/ratio_final_to_reasoning": 1.3707782579636782, "adv/ratio_step_to_reasoning": 1.4129494247629784, "adv/std_final_conf": 0.9092122912406921, "adv/std_reasoning": 0.7927682995796204, "adv/std_step_conf": 0.9349443316459656, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5244153600850386, "calib/avg_num_step_conf": 6.23828125, "calib/ece": 0.40125, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9879032258064516, "calib/gap": 0.0037244220037203624, "calib/mean_conf": 0.9738306451612904, "calib/mu_c": 0.9754225352112675, "calib/mu_w": 0.9716981132075472, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.40125, "calib/std_conf": 0.02089344000081456, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.42900846432889966, "calib/step_q_c_n": 827.0, "calib/step_q_gap": 0.0022292435496789054, "calib/step_q_w": 0.42677922077922076, "calib/step_q_w_n": 770.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2605.0, "completions/max_terminated_length": 2605.0, "completions/mean_length": 588.99609375, "completions/mean_terminated_length": 593.6338500976562, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.032, "grad_norm": 0.02038068138062954, "kl": 0.049793243408203125, "learning_rate": 4.722222222222222e-06, "loss": -0.0398, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.028470445424318314, "mask/share_reasoning": 0.844284176826477, "mask/share_step_conf": 0.11943288147449493, "num_tokens": 7129813.0, "reward": 0.8432120084762573, "reward_std": 0.2119450569152832, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.5770386457443237, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8046977519989014, "step": 30 }, { "adv/mean_abs_final_conf": 0.7576237916946411, "adv/mean_abs_reasoning": 0.43475398421287537, "adv/mean_abs_step_conf": 0.7570363283157349, "adv/ratio_final_to_reasoning": 1.7426494505077934, "adv/ratio_step_to_reasoning": 1.741298195774683, "adv/std_final_conf": 0.9100903272628784, "adv/std_reasoning": 0.7014799118041992, "adv/std_step_conf": 0.9344565868377686, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5342292089249493, "calib/avg_num_step_conf": 7.34765625, "calib/ece": 0.5067063492063492, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9722222222222222, "calib/gap": 0.010945740365111867, "calib/mean_conf": 0.9670238095238095, "calib/mu_c": 0.9729310344827589, "calib/mu_w": 0.961985294117647, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5067063492063492, "calib/std_conf": 0.06575169311426353, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4390456431535269, "calib/step_q_c_n": 723.0, "calib/step_q_gap": 0.0510145550706253, "calib/step_q_w": 0.3880310880829016, "calib/step_q_w_n": 1158.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2400.0, "completions/max_terminated_length": 2400.0, "completions/mean_length": 599.3359375, "completions/mean_terminated_length": 601.6863403320312, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.03306666666666667, "grad_norm": 0.023156926035881042, "kl": 0.037746429443359375, "learning_rate": 4.694444444444445e-06, "loss": -0.0181, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.02954915352165699, "mask/share_reasoning": 0.840360701084137, "mask/share_step_conf": 0.12618383765220642, "num_tokens": 7389155.0, "reward": 0.8060630559921265, "reward_std": 0.19124513864517212, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.4881894588470459, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8364366292953491, "step": 31 }, { "adv/mean_abs_final_conf": 0.763539731502533, "adv/mean_abs_reasoning": 0.502423882484436, "adv/mean_abs_step_conf": 0.767042875289917, "adv/ratio_final_to_reasoning": 1.519712255171679, "adv/ratio_step_to_reasoning": 1.526684741770169, "adv/std_final_conf": 0.9136727452278137, "adv/std_reasoning": 0.7576688528060913, "adv/std_step_conf": 0.93479984998703, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5844725950081525, "calib/avg_num_step_conf": 6.0390625, "calib/ece": 0.4245849802371543, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.924901185770751, "calib/gap": 0.02855512354195411, "calib/mean_conf": 0.9440316205533598, "calib/mu_c": 0.9574626865671643, "calib/mu_w": 0.9289075630252102, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4194861660079053, "calib/std_conf": 0.12681391781318305, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4357412398921833, "calib/step_q_c_n": 742.0, "calib/step_q_gap": 0.05600243392203397, "calib/step_q_w": 0.3797388059701493, "calib/step_q_w_n": 804.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2402.0, "completions/max_terminated_length": 2402.0, "completions/mean_length": 504.4375, "completions/mean_terminated_length": 508.4094543457031, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.034133333333333335, "grad_norm": 0.02298598177731037, "kl": 0.048114776611328125, "learning_rate": 4.666666666666667e-06, "loss": -0.044, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03240440785884857, "mask/share_reasoning": 0.830779492855072, "mask/share_step_conf": 0.1290036290884018, "num_tokens": 7624995.0, "reward": 0.8541609048843384, "reward_std": 0.20546889305114746, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.5703113079071045, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8364480137825012, "step": 32 }, { "adv/mean_abs_final_conf": 0.730973482131958, "adv/mean_abs_reasoning": 0.43391260504722595, "adv/mean_abs_step_conf": 0.7419459223747253, "adv/ratio_final_to_reasoning": 1.684609927504643, "adv/ratio_step_to_reasoning": 1.7098971399873801, "adv/std_final_conf": 0.894087553024292, "adv/std_reasoning": 0.701427698135376, "adv/std_step_conf": 0.9345289468765259, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5539067422810334, "calib/avg_num_step_conf": 6.52734375, "calib/ece": 0.40873517786561275, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9486166007905138, "calib/gap": 0.03398550724637672, "calib/mean_conf": 0.9541897233201582, "calib/mu_c": 0.9696376811594203, "calib/mu_w": 0.9356521739130436, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.40873517786561275, "calib/std_conf": 0.10689211607677195, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.42857929515418497, "calib/step_q_c_n": 908.0, "calib/step_q_gap": 0.07017824666139333, "calib/step_q_w": 0.35840104849279164, "calib/step_q_w_n": 763.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2725.0, "completions/max_terminated_length": 2725.0, "completions/mean_length": 537.07421875, "completions/mean_terminated_length": 541.3031616210938, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.0352, "grad_norm": 0.019974276423454285, "kl": 0.048274993896484375, "learning_rate": 4.638888888888889e-06, "loss": -0.0733, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03093218244612217, "mask/share_reasoning": 0.8321001529693604, "mask/share_step_conf": 0.12915518879890442, "num_tokens": 7869358.0, "reward": 0.8696585893630981, "reward_std": 0.18201705813407898, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5835089683532715, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8503393530845642, "step": 33 }, { "adv/mean_abs_final_conf": 0.7161438465118408, "adv/mean_abs_reasoning": 0.5695993304252625, "adv/mean_abs_step_conf": 0.7828904390335083, "adv/ratio_final_to_reasoning": 1.2572764893827533, "adv/ratio_step_to_reasoning": 1.374458144901615, "adv/std_final_conf": 0.9155739545822144, "adv/std_reasoning": 0.8264137506484985, "adv/std_step_conf": 0.934232234954834, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4919909898635966, "calib/avg_num_step_conf": 6.81640625, "calib/ece": 0.43913043478260877, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9407114624505929, "calib/gap": 0.006639969966211967, "calib/mean_conf": 0.94901185770751, "calib/mu_c": 0.9522137404580152, "calib/mu_w": 0.9455737704918032, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4351778656126483, "calib/std_conf": 0.09978738196047238, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.39701346389228886, "calib/step_q_c_n": 817.0, "calib/step_q_gap": 0.011119067340564726, "calib/step_q_w": 0.38589439655172414, "calib/step_q_w_n": 928.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2860.0, "completions/max_terminated_length": 2860.0, "completions/mean_length": 474.125, "completions/mean_terminated_length": 477.8582763671875, "completions/min_length": 0.0, "completions/min_terminated_length": 191.0, "epoch": 0.03626666666666667, "grad_norm": 0.01743357814848423, "kl": 0.0554962158203125, "learning_rate": 4.611111111111112e-06, "loss": -0.0244, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.033780019730329514, "mask/share_reasoning": 0.8123932480812073, "mask/share_step_conf": 0.14601418375968933, "num_tokens": 8095846.0, "reward": 0.843841552734375, "reward_std": 0.19271723926067352, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5511835813522339, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8364993929862976, "step": 34 }, { "adv/mean_abs_final_conf": 0.7311309576034546, "adv/mean_abs_reasoning": 0.5093601942062378, "adv/mean_abs_step_conf": 0.752097487449646, "adv/ratio_final_to_reasoning": 1.4353908411371123, "adv/ratio_step_to_reasoning": 1.4765533231776744, "adv/std_final_conf": 0.9236873984336853, "adv/std_reasoning": 0.7753287553787231, "adv/std_step_conf": 0.934443473815918, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6312058971774194, "calib/avg_num_step_conf": 6.53125, "calib/ece": 0.44246031746031755, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8531746031746031, "calib/gap": 0.056030745967741935, "calib/mean_conf": 0.9345238095238095, "calib/mu_c": 0.962983870967742, "calib/mu_w": 0.906953125, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.44246031746031755, "calib/std_conf": 0.1116620975814138, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3825062972292192, "calib/step_q_c_n": 794.0, "calib/step_q_gap": 0.027586023880699806, "calib/step_q_w": 0.3549202733485194, "calib/step_q_w_n": 878.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2681.0, "completions/max_terminated_length": 2681.0, "completions/mean_length": 548.6953125, "completions/mean_terminated_length": 555.2015991210938, "completions/min_length": 0.0, "completions/min_terminated_length": 211.0, "epoch": 0.037333333333333336, "grad_norm": 0.03290172666311264, "kl": 0.05770111083984375, "learning_rate": 4.583333333333333e-06, "loss": -0.0702, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.029511885717511177, "mask/share_reasoning": 0.8369863033294678, "mask/share_step_conf": 0.12178307771682739, "num_tokens": 8345568.0, "reward": 0.8452932834625244, "reward_std": 0.2060595154762268, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.5609281063079834, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8359084129333496, "step": 35 }, { "adv/mean_abs_final_conf": 0.7096432447433472, "adv/mean_abs_reasoning": 0.37983816862106323, "adv/mean_abs_step_conf": 0.7735100984573364, "adv/ratio_final_to_reasoning": 1.8682778703351066, "adv/ratio_step_to_reasoning": 2.0364201450987167, "adv/std_final_conf": 0.9167184829711914, "adv/std_reasoning": 0.6815183758735657, "adv/std_step_conf": 0.9348844885826111, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5985863095238095, "calib/avg_num_step_conf": 6.6796875, "calib/ece": 0.20118577075098812, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.8260869565217391, "calib/gap": 0.04194857804232799, "calib/mean_conf": 0.9274308300395256, "calib/mu_c": 0.938042328042328, "calib/mu_w": 0.89609375, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19079051383399206, "calib/std_conf": 0.12201260285332109, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.37776412776412777, "calib/step_q_c_n": 1221.0, "calib/step_q_gap": 0.007661878275375178, "calib/step_q_w": 0.3701022494887526, "calib/step_q_w_n": 489.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2194.0, "completions/max_terminated_length": 2194.0, "completions/mean_length": 487.57421875, "completions/mean_terminated_length": 487.57421875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.0384, "grad_norm": 0.04586457833647728, "kl": 0.0617828369140625, "learning_rate": 4.555555555555556e-06, "loss": 0.0154, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03654136136174202, "mask/share_reasoning": 0.8108397722244263, "mask/share_step_conf": 0.15261885523796082, "num_tokens": 8573099.0, "reward": 0.9648346304893494, "reward_std": 0.16412945091724396, "rewards/accuracy_reward_step": 0.73828125, "rewards/final_brier_reward_step": 0.7703171372413635, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8140394687652588, "step": 36 }, { "adv/mean_abs_final_conf": 0.7422738075256348, "adv/mean_abs_reasoning": 0.40673696994781494, "adv/mean_abs_step_conf": 0.7679119110107422, "adv/ratio_final_to_reasoning": 1.8249479697428779, "adv/ratio_step_to_reasoning": 1.8879815894514498, "adv/std_final_conf": 0.9150478839874268, "adv/std_reasoning": 0.6613951921463013, "adv/std_step_conf": 0.9349683523178101, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6805715057758562, "calib/avg_num_step_conf": 6.44140625, "calib/ece": 0.4525000000000001, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.7622950819672131, "calib/gap": 0.05334797000608005, "calib/mean_conf": 0.9090573770491803, "calib/mu_c": 0.937699115044248, "calib/mu_w": 0.8843511450381679, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.44922131147540995, "calib/std_conf": 0.13716215120895295, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.37116319444444446, "calib/step_q_c_n": 576.0, "calib/step_q_gap": 0.06508677319560946, "calib/step_q_w": 0.306076421248835, "calib/step_q_w_n": 1073.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2687.0, "completions/max_terminated_length": 2687.0, "completions/mean_length": 547.22265625, "completions/mean_terminated_length": 553.7114868164062, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.039466666666666664, "grad_norm": 0.030891193076968193, "kl": 0.056957244873046875, "learning_rate": 4.527777777777778e-06, "loss": 0.0362, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03294394165277481, "mask/share_reasoning": 0.8250966668128967, "mask/share_step_conf": 0.13024061918258667, "num_tokens": 8820284.0, "reward": 0.8101900219917297, "reward_std": 0.18450546264648438, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.5339511632919312, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8075225353240967, "step": 37 }, { "adv/mean_abs_final_conf": 0.726380467414856, "adv/mean_abs_reasoning": 0.43823903799057007, "adv/mean_abs_step_conf": 0.7534304261207581, "adv/ratio_final_to_reasoning": 1.6574983158631478, "adv/ratio_step_to_reasoning": 1.7192225265357812, "adv/std_final_conf": 0.9267221093177795, "adv/std_reasoning": 0.7014729976654053, "adv/std_step_conf": 0.9348070025444031, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6114302310571834, "calib/avg_num_step_conf": 6.2578125, "calib/ece": 0.4103212851405621, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7349397590361446, "calib/gap": 0.047305408545243455, "calib/mean_conf": 0.892570281124498, "calib/mu_c": 0.9157480314960631, "calib/mu_w": 0.8684426229508196, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3964257028112448, "calib/std_conf": 0.17416876619184174, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.35076177285318555, "calib/step_q_c_n": 722.0, "calib/step_q_gap": 0.03461404558045822, "calib/step_q_w": 0.31614772727272733, "calib/step_q_w_n": 880.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2035.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 538.00390625, "completions/mean_terminated_length": 542.2401733398438, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.04053333333333333, "grad_norm": 0.02675260789692402, "kl": 0.05478668212890625, "learning_rate": 4.5e-06, "loss": -0.0246, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.031882841140031815, "mask/share_reasoning": 0.8331516981124878, "mask/share_step_conf": 0.12715290486812592, "num_tokens": 9064901.0, "reward": 0.8506743907928467, "reward_std": 0.1840541660785675, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.58075350522995, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8260639309883118, "step": 38 }, { "adv/mean_abs_final_conf": 0.7293663024902344, "adv/mean_abs_reasoning": 0.45697662234306335, "adv/mean_abs_step_conf": 0.7594219446182251, "adv/ratio_final_to_reasoning": 1.5960691790983599, "adv/ratio_step_to_reasoning": 1.661839812996186, "adv/std_final_conf": 0.9291202425956726, "adv/std_reasoning": 0.7205585241317749, "adv/std_step_conf": 0.9344246983528137, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5710066769388804, "calib/avg_num_step_conf": 6.6640625, "calib/ece": 0.36484000000000005, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.664, "calib/gap": 0.047203389830508535, "calib/mean_conf": 0.85772, "calib/mu_c": 0.88, "calib/mu_w": 0.8327966101694915, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34728000000000003, "calib/std_conf": 0.21252012045921675, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.37581534772182257, "calib/step_q_c_n": 834.0, "calib/step_q_gap": 0.06166397157503367, "calib/step_q_w": 0.3141513761467889, "calib/step_q_w_n": 872.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2459.0, "completions/max_terminated_length": 2459.0, "completions/mean_length": 518.83984375, "completions/mean_terminated_length": 522.9251708984375, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.0416, "grad_norm": 0.025498710572719574, "kl": 0.05701446533203125, "learning_rate": 4.472222222222223e-06, "loss": 0.0204, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03186484053730965, "mask/share_reasoning": 0.8250347375869751, "mask/share_step_conf": 0.13528786599636078, "num_tokens": 9303812.0, "reward": 0.8701096773147583, "reward_std": 0.1742296665906906, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.6058902740478516, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8358914852142334, "step": 39 }, { "adv/mean_abs_final_conf": 0.774293065071106, "adv/mean_abs_reasoning": 0.46777427196502686, "adv/mean_abs_step_conf": 0.7591378688812256, "adv/ratio_final_to_reasoning": 1.6552707394924788, "adv/ratio_step_to_reasoning": 1.6228722150370478, "adv/std_final_conf": 0.935425341129303, "adv/std_reasoning": 0.7205560803413391, "adv/std_step_conf": 0.9340584874153137, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.51763916015625, "calib/avg_num_step_conf": 6.7578125, "calib/ece": 0.4000390625, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.58984375, "calib/gap": 0.010703125000000036, "calib/mean_conf": 0.8308984374999999, "calib/mu_c": 0.83625, "calib/mu_w": 0.825546875, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36546875, "calib/std_conf": 0.21778082947095823, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3220449172576832, "calib/step_q_c_n": 846.0, "calib/step_q_gap": -0.025658702651819076, "calib/step_q_w": 0.34770361990950227, "calib/step_q_w_n": 884.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1865.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 523.01171875, "completions/mean_terminated_length": 525.0628051757812, "completions/min_length": 0.0, "completions/min_terminated_length": 159.0, "epoch": 0.042666666666666665, "grad_norm": 0.025849351659417152, "kl": 0.06219482421875, "learning_rate": 4.444444444444444e-06, "loss": -0.0527, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.033173397183418274, "mask/share_reasoning": 0.8261204957962036, "mask/share_step_conf": 0.13679982721805573, "num_tokens": 9544463.0, "reward": 0.8790460824966431, "reward_std": 0.15888527035713196, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.5984293222427368, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8596628904342651, "step": 40 }, { "adv/mean_abs_final_conf": 0.7439480423927307, "adv/mean_abs_reasoning": 0.42733556032180786, "adv/mean_abs_step_conf": 0.7698180675506592, "adv/ratio_final_to_reasoning": 1.7408989830673014, "adv/ratio_step_to_reasoning": 1.8014369479828511, "adv/std_final_conf": 0.9251158237457275, "adv/std_reasoning": 0.701311469078064, "adv/std_step_conf": 0.9340131282806396, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6641814975148307, "calib/avg_num_step_conf": 5.9296875, "calib/ece": 0.16066666666666657, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5137254901960784, "calib/gap": 0.11002164502164502, "calib/mean_conf": 0.8019999999999999, "calib/mu_c": 0.8304761904761905, "calib/mu_w": 0.7204545454545455, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11074509803921559, "calib/std_conf": 0.24876542222366363, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3597395348837209, "calib/step_q_c_n": 1075.0, "calib/step_q_gap": 0.01599235655414971, "calib/step_q_w": 0.3437471783295712, "calib/step_q_w_n": 443.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1069.0, "completions/max_terminated_length": 1069.0, "completions/mean_length": 463.296875, "completions/mean_terminated_length": 465.1137390136719, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.04373333333333333, "grad_norm": 0.05439276620745659, "kl": 0.080596923828125, "learning_rate": 4.416666666666667e-06, "loss": -0.0363, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03658699989318848, "mask/share_reasoning": 0.8209108114242554, "mask/share_step_conf": 0.13859596848487854, "num_tokens": 9770315.0, "reward": 0.9843002557754517, "reward_std": 0.14363425970077515, "rewards/accuracy_reward_step": 0.73828125, "rewards/final_brier_reward_step": 0.7817285060882568, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8399968147277832, "step": 41 }, { "adv/mean_abs_final_conf": 0.7876873016357422, "adv/mean_abs_reasoning": 0.45011699199676514, "adv/mean_abs_step_conf": 0.7809913158416748, "adv/ratio_final_to_reasoning": 1.749961267050774, "adv/ratio_step_to_reasoning": 1.7350851661411786, "adv/std_final_conf": 0.923464834690094, "adv/std_reasoning": 0.7013947367668152, "adv/std_step_conf": 0.9337511658668518, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6822884207902281, "calib/avg_num_step_conf": 6.28515625, "calib/ece": 0.2839607843137254, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5568627450980392, "calib/gap": 0.11492583821513147, "calib/mean_conf": 0.8147058823529412, "calib/mu_c": 0.8656338028169013, "calib/mu_w": 0.7507079646017698, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27090196078431367, "calib/std_conf": 0.2344367713677176, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3854927884615385, "calib/step_q_c_n": 832.0, "calib/step_q_gap": 0.020280433249183327, "calib/step_q_w": 0.3652123552123552, "calib/step_q_w_n": 777.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2811.0, "completions/max_terminated_length": 2811.0, "completions/mean_length": 445.33203125, "completions/mean_terminated_length": 445.33203125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.0448, "grad_norm": 0.03572205826640129, "kl": 0.067230224609375, "learning_rate": 4.388888888888889e-06, "loss": 0.058, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03780219703912735, "mask/share_reasoning": 0.8130389451980591, "mask/share_step_conf": 0.14915883541107178, "num_tokens": 9988688.0, "reward": 0.926224946975708, "reward_std": 0.15658482909202576, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6858199238777161, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8564735651016235, "step": 42 }, { "adv/mean_abs_final_conf": 0.7961469888687134, "adv/mean_abs_reasoning": 0.6136384606361389, "adv/mean_abs_step_conf": 0.7659261226654053, "adv/ratio_final_to_reasoning": 1.2974202888837407, "adv/ratio_step_to_reasoning": 1.248171638184795, "adv/std_final_conf": 0.9268906712532043, "adv/std_reasoning": 0.8099459409713745, "adv/std_step_conf": 0.9335736632347107, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6772471364783537, "calib/avg_num_step_conf": 5.8515625, "calib/ece": 0.2553149606299212, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.562992125984252, "calib/gap": 0.15169934640522875, "calib/mean_conf": 0.8013779527559054, "calib/mu_c": 0.8616993464052288, "calib/mu_w": 0.7100000000000001, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22716535433070859, "calib/std_conf": 0.26301116497165244, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3862155688622755, "calib/step_q_c_n": 835.0, "calib/step_q_gap": 0.012082838847192579, "calib/step_q_w": 0.3741327300150829, "calib/step_q_w_n": 663.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2298.0, "completions/max_terminated_length": 2298.0, "completions/mean_length": 501.875, "completions/mean_terminated_length": 501.875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.04586666666666667, "grad_norm": 0.03204227611422539, "kl": 0.05794525146484375, "learning_rate": 4.361111111111112e-06, "loss": -0.0227, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03475869074463844, "mask/share_reasoning": 0.8334618210792542, "mask/share_step_conf": 0.1317795068025589, "num_tokens": 10222392.0, "reward": 0.9406875967979431, "reward_std": 0.20013371109962463, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7148027420043945, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8493848443031311, "step": 43 }, { "adv/mean_abs_final_conf": 0.6989273428916931, "adv/mean_abs_reasoning": 0.3648616373538971, "adv/mean_abs_step_conf": 0.7579975128173828, "adv/ratio_final_to_reasoning": 1.9155955884004583, "adv/ratio_step_to_reasoning": 2.0774930417860404, "adv/std_final_conf": 0.8844950199127197, "adv/std_reasoning": 0.6611524224281311, "adv/std_step_conf": 0.9328915476799011, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7685390490268539, "calib/avg_num_step_conf": 6.52734375, "calib/ece": 0.3349803921568627, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.6392156862745098, "calib/gap": 0.23194382852919448, "calib/mean_conf": 0.8173333333333332, "calib/mu_c": 0.9373983739837399, "calib/mu_w": 0.7054545454545454, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3349803921568627, "calib/std_conf": 0.27277503958797666, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4172826086956522, "calib/step_q_c_n": 736.0, "calib/step_q_gap": 0.04115426644966291, "calib/step_q_w": 0.3761283422459893, "calib/step_q_w_n": 935.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2344.0, "completions/max_terminated_length": 2344.0, "completions/mean_length": 528.828125, "completions/mean_terminated_length": 528.828125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.046933333333333334, "grad_norm": 0.051673296838998795, "kl": 0.0508880615234375, "learning_rate": 4.333333333333334e-06, "loss": 0.0202, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.031590502709150314, "mask/share_reasoning": 0.83428955078125, "mask/share_step_conf": 0.13411997258663177, "num_tokens": 10464092.0, "reward": 0.9205601811408997, "reward_std": 0.1538148820400238, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.6768664121627808, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8689414262771606, "step": 44 }, { "adv/mean_abs_final_conf": 0.7376207709312439, "adv/mean_abs_reasoning": 0.5290188193321228, "adv/mean_abs_step_conf": 0.7491678595542908, "adv/ratio_final_to_reasoning": 1.3943185837178298, "adv/ratio_step_to_reasoning": 1.4161459520477975, "adv/std_final_conf": 0.9091355800628662, "adv/std_reasoning": 0.792652428150177, "adv/std_step_conf": 0.9330114722251892, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6921934925151553, "calib/avg_num_step_conf": 6.4609375, "calib/ece": 0.3495294117647061, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.7294117647058823, "calib/gap": 0.13181306445626606, "calib/mean_conf": 0.8409019607843138, "calib/mu_c": 0.9018978102189781, "calib/mu_w": 0.770084745762712, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.32658823529411785, "calib/std_conf": 0.26638533602857584, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4050833333333333, "calib/step_q_c_n": 840.0, "calib/step_q_gap": 0.051410114660114636, "calib/step_q_w": 0.35367321867321866, "calib/step_q_w_n": 814.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1282.0, "completions/max_terminated_length": 1282.0, "completions/mean_length": 473.85546875, "completions/mean_terminated_length": 475.7137451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.048, "grad_norm": 0.03942892327904701, "kl": 0.0570831298828125, "learning_rate": 4.305555555555556e-06, "loss": -0.0324, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03653804212808609, "mask/share_reasoning": 0.8149880170822144, "mask/share_step_conf": 0.14456769824028015, "num_tokens": 10690447.0, "reward": 0.912110447883606, "reward_std": 0.1834811270236969, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6512120962142944, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8667587637901306, "step": 45 }, { "adv/mean_abs_final_conf": 0.7239193916320801, "adv/mean_abs_reasoning": 0.3032967746257782, "adv/mean_abs_step_conf": 0.7477271556854248, "adv/ratio_final_to_reasoning": 2.386835113974706, "adv/ratio_step_to_reasoning": 2.465331708878229, "adv/std_final_conf": 0.8901360034942627, "adv/std_reasoning": 0.5960788130760193, "adv/std_step_conf": 0.9325743913650513, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6502356637863316, "calib/avg_num_step_conf": 7.09375, "calib/ece": 0.3412096774193549, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6895161290322581, "calib/gap": 0.0969939774810159, "calib/mean_conf": 0.8285483870967743, "calib/mu_c": 0.8731343283582089, "calib/mu_w": 0.776140350877193, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3147177419354839, "calib/std_conf": 0.2664240300493203, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3975352112676057, "calib/step_q_c_n": 852.0, "calib/step_q_gap": 0.030522763134825603, "calib/step_q_w": 0.3670124481327801, "calib/step_q_w_n": 964.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3039.0, "completions/max_terminated_length": 3039.0, "completions/mean_length": 558.02734375, "completions/mean_terminated_length": 558.02734375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.04906666666666667, "grad_norm": 0.037199702113866806, "kl": 0.04779052734375, "learning_rate": 4.277777777777778e-06, "loss": 0.0717, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.0372079461812973, "mask/share_reasoning": 0.8186650276184082, "mask/share_step_conf": 0.1441270411014557, "num_tokens": 10938070.0, "reward": 0.8839125633239746, "reward_std": 0.15215706825256348, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6255718469619751, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8438156247138977, "step": 46 }, { "adv/mean_abs_final_conf": 0.7109636068344116, "adv/mean_abs_reasoning": 0.4669240117073059, "adv/mean_abs_step_conf": 0.7571384906768799, "adv/ratio_final_to_reasoning": 1.522653770224358, "adv/ratio_step_to_reasoning": 1.6215454157270812, "adv/std_final_conf": 0.8986199498176575, "adv/std_reasoning": 0.7392911314964294, "adv/std_step_conf": 0.9324113130569458, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7390342439399769, "calib/avg_num_step_conf": 6.8046875, "calib/ece": 0.2174103585657371, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5338645418326693, "calib/gap": 0.2485314864691549, "calib/mean_conf": 0.7427490039840637, "calib/mu_c": 0.8546376811594203, "calib/mu_w": 0.6061061946902654, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20517928286852594, "calib/std_conf": 0.30816642672872885, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3618954248366013, "calib/step_q_c_n": 918.0, "calib/step_q_gap": 0.029176978234659534, "calib/step_q_w": 0.3327184466019418, "calib/step_q_w_n": 824.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2749.0, "completions/max_terminated_length": 2749.0, "completions/mean_length": 526.5, "completions/mean_terminated_length": 530.6456909179688, "completions/min_length": 0.0, "completions/min_terminated_length": 176.0, "epoch": 0.050133333333333335, "grad_norm": 0.04363685101270676, "kl": 0.05706024169921875, "learning_rate": 4.25e-06, "loss": -0.0684, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.032928965985774994, "mask/share_reasoning": 0.8188336491584778, "mask/share_step_conf": 0.14042489230632782, "num_tokens": 11178830.0, "reward": 0.9456167817115784, "reward_std": 0.15967297554016113, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7287996411323547, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.858527660369873, "step": 47 }, { "adv/mean_abs_final_conf": 0.7372201681137085, "adv/mean_abs_reasoning": 0.5289657115936279, "adv/mean_abs_step_conf": 0.7410778999328613, "adv/ratio_final_to_reasoning": 1.3937012399020483, "adv/ratio_step_to_reasoning": 1.400994211326473, "adv/std_final_conf": 0.9006651043891907, "adv/std_reasoning": 0.7576570510864258, "adv/std_step_conf": 0.933323860168457, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7045367656429208, "calib/avg_num_step_conf": 5.6875, "calib/ece": 0.23976095617529883, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.4581673306772908, "calib/gap": 0.21290811775200735, "calib/mean_conf": 0.6956972111553785, "calib/mu_c": 0.7957894736842106, "calib/mu_w": 0.5828813559322032, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20278884462151398, "calib/std_conf": 0.3239236112549953, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3789867841409692, "calib/step_q_c_n": 681.0, "calib/step_q_gap": 0.040986784140969124, "calib/step_q_w": 0.3380000000000001, "calib/step_q_w_n": 775.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2967.0, "completions/max_terminated_length": 2967.0, "completions/mean_length": 458.0234375, "completions/mean_terminated_length": 463.4545593261719, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.0512, "grad_norm": 0.055858712643384933, "kl": 0.08083343505859375, "learning_rate": 4.222222222222223e-06, "loss": -0.0699, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03893757984042168, "mask/share_reasoning": 0.8088648319244385, "mask/share_step_conf": 0.14047878980636597, "num_tokens": 11399772.0, "reward": 0.9301211833953857, "reward_std": 0.15484796464443207, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.7103937268257141, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8498485088348389, "step": 48 }, { "adv/mean_abs_final_conf": 0.6789600253105164, "adv/mean_abs_reasoning": 0.46352851390838623, "adv/mean_abs_step_conf": 0.7762776017189026, "adv/ratio_final_to_reasoning": 1.4647643131717005, "adv/ratio_step_to_reasoning": 1.6747138060040239, "adv/std_final_conf": 0.8645612597465515, "adv/std_reasoning": 0.7206313610076904, "adv/std_step_conf": 0.9317847490310669, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7854982817869415, "calib/avg_num_step_conf": 6.4765625, "calib/ece": 0.1580566801619434, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.4939271255060729, "calib/gap": 0.31128934707903766, "calib/mean_conf": 0.7016194331983806, "calib/mu_c": 0.8238666666666666, "calib/mu_w": 0.512577319587629, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.12619433198380575, "calib/std_conf": 0.3236691982653765, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.37662556053811663, "calib/step_q_c_n": 892.0, "calib/step_q_gap": 0.035450625812268044, "calib/step_q_w": 0.3411749347258486, "calib/step_q_w_n": 766.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3071.0, "completions/max_terminated_length": 3071.0, "completions/mean_length": 504.3828125, "completions/mean_terminated_length": 506.3608093261719, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.05226666666666667, "grad_norm": 0.03892216458916664, "kl": 0.057281494140625, "learning_rate": 4.194444444444445e-06, "loss": -0.0143, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03590047359466553, "mask/share_reasoning": 0.8158446550369263, "mask/share_step_conf": 0.14434868097305298, "num_tokens": 11633430.0, "reward": 0.9651601314544678, "reward_std": 0.15499456226825714, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7683327794075012, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8510499596595764, "step": 49 }, { "adv/mean_abs_final_conf": 0.6920836567878723, "adv/mean_abs_reasoning": 0.39064478874206543, "adv/mean_abs_step_conf": 0.7383641600608826, "adv/ratio_final_to_reasoning": 1.7716444113243774, "adv/ratio_step_to_reasoning": 1.8901164980045566, "adv/std_final_conf": 0.8895038366317749, "adv/std_reasoning": 0.6613850593566895, "adv/std_step_conf": 0.9324697256088257, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7615255376344086, "calib/avg_num_step_conf": 5.87890625, "calib/ece": 0.16195219123505986, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.4302788844621514, "calib/gap": 0.3052594086021505, "calib/mean_conf": 0.6362151394422311, "calib/mu_c": 0.7529677419354839, "calib/mu_w": 0.4477083333333334, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.09031872509960168, "calib/std_conf": 0.3449678337246269, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3922669735327963, "calib/step_q_c_n": 869.0, "calib/step_q_gap": 0.06339904900449439, "calib/step_q_w": 0.32886792452830194, "calib/step_q_w_n": 636.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2308.0, "completions/max_terminated_length": 2308.0, "completions/mean_length": 470.5546875, "completions/mean_terminated_length": 472.4000244140625, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.05333333333333334, "grad_norm": 0.0452268086373806, "kl": 0.0612335205078125, "learning_rate": 4.166666666666667e-06, "loss": -0.028, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03737880289554596, "mask/share_reasoning": 0.8206923604011536, "mask/share_step_conf": 0.13802257180213928, "num_tokens": 11859252.0, "reward": 0.9782302379608154, "reward_std": 0.15078993141651154, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7732542753219604, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8652373552322388, "step": 50 }, { "adv/mean_abs_final_conf": 0.6727511882781982, "adv/mean_abs_reasoning": 0.41828539967536926, "adv/mean_abs_step_conf": 0.7507631778717041, "adv/ratio_final_to_reasoning": 1.608354460376383, "adv/ratio_step_to_reasoning": 1.7948586741358183, "adv/std_final_conf": 0.8566681146621704, "adv/std_reasoning": 0.7013798952102661, "adv/std_step_conf": 0.9328375458717346, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7685499186109604, "calib/avg_num_step_conf": 5.671875, "calib/ece": 0.15409638554216873, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.3895582329317269, "calib/gap": 0.33237791644058595, "calib/mean_conf": 0.6095983935742971, "calib/mu_c": 0.7390789473684211, "calib/mu_w": 0.4067010309278351, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07662650602409644, "calib/std_conf": 0.36308893353011723, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3618518518518518, "calib/step_q_c_n": 783.0, "calib/step_q_gap": 0.06986380999833908, "calib/step_q_w": 0.29198804185351274, "calib/step_q_w_n": 669.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2437.0, "completions/max_terminated_length": 2437.0, "completions/mean_length": 479.69921875, "completions/mean_terminated_length": 481.5804138183594, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.0544, "grad_norm": 0.05431721359491348, "kl": 0.06055450439453125, "learning_rate": 4.138888888888889e-06, "loss": 0.0452, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.0363931804895401, "mask/share_reasoning": 0.8310154676437378, "mask/share_step_conf": 0.1286850869655609, "num_tokens": 12091351.0, "reward": 0.9718501567840576, "reward_std": 0.16381707787513733, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7668848037719727, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8635340929031372, "step": 51 }, { "adv/mean_abs_final_conf": 0.6627511978149414, "adv/mean_abs_reasoning": 0.4197141230106354, "adv/mean_abs_step_conf": 0.7539057731628418, "adv/ratio_final_to_reasoning": 1.579053840411626, "adv/ratio_step_to_reasoning": 1.796236370973245, "adv/std_final_conf": 0.8806871175765991, "adv/std_reasoning": 0.7013139724731445, "adv/std_step_conf": 0.9320465922355652, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6909056316590563, "calib/avg_num_step_conf": 5.515625, "calib/ece": 0.2622529644268774, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.43873517786561267, "calib/gap": 0.28380289193302904, "calib/mean_conf": 0.5797233201581028, "calib/mu_c": 0.6616111111111113, "calib/mu_w": 0.3778082191780822, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06525691699604735, "calib/std_conf": 0.3985074953417087, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3857852077001013, "calib/step_q_c_n": 987.0, "calib/step_q_gap": 0.07912638417068957, "calib/step_q_w": 0.3066588235294117, "calib/step_q_w_n": 425.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1642.0, "completions/max_terminated_length": 1642.0, "completions/mean_length": 445.98046875, "completions/mean_terminated_length": 447.72943115234375, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.055466666666666664, "grad_norm": 0.08861194550991058, "kl": 0.072235107421875, "learning_rate": 4.111111111111111e-06, "loss": -0.0165, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.038540616631507874, "mask/share_reasoning": 0.8251634240150452, "mask/share_step_conf": 0.13238969445228577, "num_tokens": 12313474.0, "reward": 0.969064474105835, "reward_std": 0.1488306224346161, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.7264589667320251, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8726074695587158, "step": 52 }, { "adv/mean_abs_final_conf": 0.5378075838088989, "adv/mean_abs_reasoning": 0.3977751135826111, "adv/mean_abs_step_conf": 0.7320243120193481, "adv/ratio_final_to_reasoning": 1.3520392941757164, "adv/ratio_step_to_reasoning": 1.8402969090405883, "adv/std_final_conf": 0.7663177251815796, "adv/std_reasoning": 0.6815900802612305, "adv/std_step_conf": 0.9331000447273254, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6817835365853658, "calib/avg_num_step_conf": 5.8671875, "calib/ece": 0.2257936507936507, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6547619047619048, "calib/gap": 0.24677383592017754, "calib/mean_conf": 0.7565079365079367, "calib/mu_c": 0.8426829268292684, "calib/mu_w": 0.5959090909090908, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16575396825396815, "calib/std_conf": 0.3512693929805845, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3837, "calib/step_q_c_n": 1000.0, "calib/step_q_gap": 0.03826175298804774, "calib/step_q_w": 0.34543824701195225, "calib/step_q_w_n": 502.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2711.0, "completions/max_terminated_length": 2711.0, "completions/mean_length": 491.625, "completions/mean_terminated_length": 493.552978515625, "completions/min_length": 0.0, "completions/min_terminated_length": 126.0, "epoch": 0.05653333333333333, "grad_norm": 0.06184534728527069, "kl": 0.0595245361328125, "learning_rate": 4.083333333333334e-06, "loss": -0.0531, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.034944549202919006, "mask/share_reasoning": 0.830845832824707, "mask/share_step_conf": 0.13030338287353516, "num_tokens": 12545154.0, "reward": 0.9501216411590576, "reward_std": 0.14969132840633392, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7386132478713989, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.83663010597229, "step": 53 }, { "adv/mean_abs_final_conf": 0.38943132758140564, "adv/mean_abs_reasoning": 0.22378817200660706, "adv/mean_abs_step_conf": 0.7389417290687561, "adv/ratio_final_to_reasoning": 1.7401783306488074, "adv/ratio_step_to_reasoning": 3.301969547554729, "adv/std_final_conf": 0.6646291017532349, "adv/std_reasoning": 0.4960247576236725, "adv/std_step_conf": 0.9332090020179749, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7655405405405405, "calib/avg_num_step_conf": 5.46484375, "calib/ece": 0.1812204724409449, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7952755905511811, "calib/gap": 0.2853543543543544, "calib/mean_conf": 0.873976377952756, "calib/mu_c": 0.9571111111111111, "calib/mu_w": 0.6717567567567567, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1732677165354331, "calib/std_conf": 0.25683938927458877, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.43114044350580777, "calib/step_q_c_n": 947.0, "calib/step_q_gap": 0.0920917709394361, "calib/step_q_w": 0.33904867256637167, "calib/step_q_w_n": 452.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2314.0, "completions/max_terminated_length": 2314.0, "completions/mean_length": 420.96875, "completions/mean_terminated_length": 424.2834777832031, "completions/min_length": 0.0, "completions/min_terminated_length": 126.0, "epoch": 0.0576, "grad_norm": 0.04072105139493942, "kl": 0.06040191650390625, "learning_rate": 4.055555555555556e-06, "loss": -0.04, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04213773459196091, "mask/share_reasoning": 0.8122613430023193, "mask/share_step_conf": 0.13778847455978394, "num_tokens": 12759154.0, "reward": 1.0076714754104614, "reward_std": 0.11803492158651352, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.8116816282272339, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8645989298820496, "step": 54 }, { "adv/mean_abs_final_conf": 0.5672956109046936, "adv/mean_abs_reasoning": 0.4686982035636902, "adv/mean_abs_step_conf": 0.7626444101333618, "adv/ratio_final_to_reasoning": 1.2103643807280036, "adv/ratio_step_to_reasoning": 1.6271545406717738, "adv/std_final_conf": 0.7936248183250427, "adv/std_reasoning": 0.7205504179000854, "adv/std_step_conf": 0.9332450032234192, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7190927218344966, "calib/avg_num_step_conf": 5.30859375, "calib/ece": 0.2828346456692915, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7244094488188977, "calib/gap": 0.26269566301096736, "calib/mean_conf": 0.8136220472440947, "calib/mu_c": 0.9356617647058825, "calib/mu_w": 0.6729661016949151, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2805118110236222, "calib/std_conf": 0.3073064829686301, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4221771771771771, "calib/step_q_c_n": 666.0, "calib/step_q_gap": 0.0822781872781872, "calib/step_q_w": 0.3398989898989899, "calib/step_q_w_n": 693.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2751.0, "completions/max_terminated_length": 2751.0, "completions/mean_length": 463.44140625, "completions/mean_terminated_length": 463.44140625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.058666666666666666, "grad_norm": 0.04620984569191933, "kl": 0.06476593017578125, "learning_rate": 4.027777777777779e-06, "loss": -0.0361, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03952161967754364, "mask/share_reasoning": 0.8331817388534546, "mask/share_step_conf": 0.12729665637016296, "num_tokens": 12985619.0, "reward": 0.9384989142417908, "reward_std": 0.17439204454421997, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7045695185661316, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.86774080991745, "step": 55 }, { "adv/mean_abs_final_conf": 0.5335018634796143, "adv/mean_abs_reasoning": 0.4167119562625885, "adv/mean_abs_step_conf": 0.752956748008728, "adv/ratio_final_to_reasoning": 1.2802653138741027, "adv/ratio_step_to_reasoning": 1.8068997941932265, "adv/std_final_conf": 0.7589111328125, "adv/std_reasoning": 0.6816851496696472, "adv/std_step_conf": 0.9334322810173035, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6513919299343133, "calib/avg_num_step_conf": 5.55859375, "calib/ece": 0.35795275590551184, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8503937007874016, "calib/gap": 0.1445192367844853, "calib/mean_conf": 0.9029133858267718, "calib/mu_c": 0.9683453237410071, "calib/mu_w": 0.8238260869565218, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3568110236220473, "calib/std_conf": 0.22530333091896765, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.44694148936170214, "calib/step_q_c_n": 752.0, "calib/step_q_gap": 0.05115907505469758, "calib/step_q_w": 0.39578241430700456, "calib/step_q_w_n": 671.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1401.0, "completions/max_terminated_length": 1401.0, "completions/mean_length": 444.484375, "completions/mean_terminated_length": 446.22747802734375, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.05973333333333333, "grad_norm": 0.06583019345998764, "kl": 0.0554656982421875, "learning_rate": 4.000000000000001e-06, "loss": -0.0506, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.037847816944122314, "mask/share_reasoning": 0.825727105140686, "mask/share_step_conf": 0.13251882791519165, "num_tokens": 13206247.0, "reward": 0.9049012064933777, "reward_std": 0.16697609424591064, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6415327787399292, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8612383604049683, "step": 56 }, { "adv/mean_abs_final_conf": 0.5082446932792664, "adv/mean_abs_reasoning": 0.4816570580005646, "adv/mean_abs_step_conf": 0.7286078333854675, "adv/ratio_final_to_reasoning": 1.0552003439730984, "adv/ratio_step_to_reasoning": 1.5127107996922855, "adv/std_final_conf": 0.761055588722229, "adv/std_reasoning": 0.7574499249458313, "adv/std_step_conf": 0.9334962964057922, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5770782889426957, "calib/avg_num_step_conf": 5.421875, "calib/ece": 0.2666535433070866, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9409448818897638, "calib/gap": 0.06645755374568973, "calib/mean_conf": 0.9561811023622048, "calib/mu_c": 0.9763276836158192, "calib/mu_w": 0.9098701298701295, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.262992125984252, "calib/std_conf": 0.14413734249370286, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.446548488008342, "calib/step_q_c_n": 959.0, "calib/step_q_gap": 0.05487016633002029, "calib/step_q_w": 0.39167832167832173, "calib/step_q_w_n": 429.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2395.0, "completions/max_terminated_length": 2395.0, "completions/mean_length": 465.83984375, "completions/mean_terminated_length": 465.83984375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.0608, "grad_norm": 0.030439382418990135, "kl": 0.0545654296875, "learning_rate": 3.972222222222223e-06, "loss": 0.0223, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03719177842140198, "mask/share_reasoning": 0.8320133686065674, "mask/share_step_conf": 0.13079488277435303, "num_tokens": 13432294.0, "reward": 0.9639207720756531, "reward_std": 0.19327566027641296, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.7231066226959229, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8680161833763123, "step": 57 }, { "adv/mean_abs_final_conf": 0.5480987429618835, "adv/mean_abs_reasoning": 0.4223048686981201, "adv/mean_abs_step_conf": 0.7950807809829712, "adv/ratio_final_to_reasoning": 1.297874553640739, "adv/ratio_step_to_reasoning": 1.8827175339797604, "adv/std_final_conf": 0.7782388925552368, "adv/std_reasoning": 0.6815927624702454, "adv/std_step_conf": 0.933213472366333, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.570188492063492, "calib/avg_num_step_conf": 6.87890625, "calib/ece": 0.3909765625, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.8671875, "calib/gap": 0.0502083333333333, "calib/mean_conf": 0.9107421875000001, "calib/mu_c": 0.9327083333333334, "calib/mu_w": 0.8825000000000001, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.369609375, "calib/std_conf": 0.21828664304696896, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.40339342523860017, "calib/step_q_c_n": 943.0, "calib/step_q_gap": 0.09268437878383246, "calib/step_q_w": 0.3107090464547677, "calib/step_q_w_n": 818.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2102.0, "completions/max_terminated_length": 2102.0, "completions/mean_length": 563.21875, "completions/mean_terminated_length": 565.427490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.06186666666666667, "grad_norm": 0.04508507251739502, "kl": 0.046657562255859375, "learning_rate": 3.944444444444445e-06, "loss": 0.0315, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03245016932487488, "mask/share_reasoning": 0.8317633867263794, "mask/share_step_conf": 0.13188019394874573, "num_tokens": 13682798.0, "reward": 0.8925913572311401, "reward_std": 0.17667566239833832, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6096965074539185, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.862986147403717, "step": 58 }, { "adv/mean_abs_final_conf": 0.6050117611885071, "adv/mean_abs_reasoning": 0.5482821464538574, "adv/mean_abs_step_conf": 0.7693145275115967, "adv/ratio_final_to_reasoning": 1.1034679226773325, "adv/ratio_step_to_reasoning": 1.4031362000154806, "adv/std_final_conf": 0.8122844099998474, "adv/std_reasoning": 0.7928110361099243, "adv/std_step_conf": 0.9340978264808655, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5672873045322026, "calib/avg_num_step_conf": 5.9453125, "calib/ece": 0.3519444444444445, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9563492063492064, "calib/gap": 0.052087198515769706, "calib/mean_conf": 0.9630555555555557, "calib/mu_c": 0.9833116883116881, "calib/mu_w": 0.9312244897959184, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3519444444444445, "calib/std_conf": 0.13237373771783348, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.44518652226233457, "calib/step_q_c_n": 831.0, "calib/step_q_gap": 0.05318941661834037, "calib/step_q_w": 0.3919971056439942, "calib/step_q_w_n": 691.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3009.0, "completions/max_terminated_length": 3009.0, "completions/mean_length": 522.1796875, "completions/mean_terminated_length": 524.2274780273438, "completions/min_length": 0.0, "completions/min_terminated_length": 123.0, "epoch": 0.06293333333333333, "grad_norm": 0.04159040376543999, "kl": 0.051868438720703125, "learning_rate": 3.916666666666667e-06, "loss": 0.0383, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.036025967448949814, "mask/share_reasoning": 0.8390854597091675, "mask/share_step_conf": 0.12098235636949539, "num_tokens": 13922724.0, "reward": 0.8882949352264404, "reward_std": 0.22001904249191284, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6356261372566223, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8237762451171875, "step": 59 }, { "adv/mean_abs_final_conf": 0.5817492008209229, "adv/mean_abs_reasoning": 0.5058983564376831, "adv/mean_abs_step_conf": 0.7513135671615601, "adv/ratio_final_to_reasoning": 1.149932972538888, "adv/ratio_step_to_reasoning": 1.4851077446702623, "adv/std_final_conf": 0.7956362366676331, "adv/std_reasoning": 0.7575187087059021, "adv/std_step_conf": 0.9340049624443054, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6091150006162948, "calib/avg_num_step_conf": 5.66015625, "calib/ece": 0.4308235294117646, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9058823529411765, "calib/gap": 0.05876124738074695, "calib/mean_conf": 0.9383529411764706, "calib/mu_c": 0.9664661654135338, "calib/mu_w": 0.9077049180327869, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42380392156862734, "calib/std_conf": 0.18075124532746079, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4824582701062216, "calib/step_q_c_n": 659.0, "calib/step_q_gap": 0.0953316878277406, "calib/step_q_w": 0.387126582278481, "calib/step_q_w_n": 790.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1737.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 467.12890625, "completions/mean_terminated_length": 468.9608154296875, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.064, "grad_norm": 0.044570669531822205, "kl": 0.05500030517578125, "learning_rate": 3.88888888888889e-06, "loss": -0.0669, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03605569154024124, "mask/share_reasoning": 0.8289955258369446, "mask/share_step_conf": 0.13104252517223358, "num_tokens": 14151165.0, "reward": 0.8683550953865051, "reward_std": 0.2017507404088974, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.5711711049079895, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8624140620231628, "step": 60 }, { "adv/mean_abs_final_conf": 0.5088446736335754, "adv/mean_abs_reasoning": 0.36581867933273315, "adv/mean_abs_step_conf": 0.7902446389198303, "adv/ratio_final_to_reasoning": 1.3909750988159681, "adv/ratio_step_to_reasoning": 2.1602085502065282, "adv/std_final_conf": 0.7080959677696228, "adv/std_reasoning": 0.618648886680603, "adv/std_step_conf": 0.9336047768592834, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5908109989557953, "calib/avg_num_step_conf": 5.0546875, "calib/ece": 0.3147244094488189, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.968503937007874, "calib/gap": 0.05019422206752544, "calib/mean_conf": 0.9762204724409449, "calib/mu_c": 0.99301775147929, "calib/mu_w": 0.9428235294117645, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3127952755905512, "calib/std_conf": 0.11279504885164993, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5343728018757328, "calib/step_q_c_n": 853.0, "calib/step_q_gap": 0.09378323271473499, "calib/step_q_w": 0.4405895691609978, "calib/step_q_w_n": 441.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2524.0, "completions/max_terminated_length": 2524.0, "completions/mean_length": 418.03515625, "completions/mean_terminated_length": 418.03515625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.06506666666666666, "grad_norm": 0.05238793045282364, "kl": 0.06305313110351562, "learning_rate": 3.861111111111112e-06, "loss": -0.0263, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04262950271368027, "mask/share_reasoning": 0.8225347399711609, "mask/share_step_conf": 0.13483577966690063, "num_tokens": 14362246.0, "reward": 0.9320245981216431, "reward_std": 0.17481377720832825, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.6849405765533447, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8486398458480835, "step": 61 }, { "adv/mean_abs_final_conf": 0.627860426902771, "adv/mean_abs_reasoning": 0.5182861089706421, "adv/mean_abs_step_conf": 0.7592391967773438, "adv/ratio_final_to_reasoning": 1.2114166597089633, "adv/ratio_step_to_reasoning": 1.4649036191327873, "adv/std_final_conf": 0.8408026695251465, "adv/std_reasoning": 0.7575954794883728, "adv/std_step_conf": 0.9342003464698792, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6059811827956989, "calib/avg_num_step_conf": 5.703125, "calib/ece": 0.36788844621513944, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.952191235059761, "calib/gap": 0.027135080645161014, "calib/mean_conf": 0.9739442231075698, "calib/mu_c": 0.9843225806451612, "calib/mu_w": 0.9571875000000002, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.36215139442231076, "calib/std_conf": 0.111826473768745, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4777091377091377, "calib/step_q_c_n": 777.0, "calib/step_q_gap": 0.10100342760664865, "calib/step_q_w": 0.37670571010248904, "calib/step_q_w_n": 683.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2620.0, "completions/max_terminated_length": 2620.0, "completions/mean_length": 496.6796875, "completions/mean_terminated_length": 498.6274719238281, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.06613333333333334, "grad_norm": 0.05509721860289574, "kl": 0.06145477294921875, "learning_rate": 3.833333333333334e-06, "loss": 0.0135, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03653930127620697, "mask/share_reasoning": 0.8363149166107178, "mask/share_step_conf": 0.12323950231075287, "num_tokens": 14596476.0, "reward": 0.888029932975769, "reward_std": 0.2173173725605011, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6231218576431274, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8365317583084106, "step": 62 }, { "adv/mean_abs_final_conf": 0.5727392435073853, "adv/mean_abs_reasoning": 0.4926705062389374, "adv/mean_abs_step_conf": 0.7514458298683167, "adv/ratio_final_to_reasoning": 1.1625198510048738, "adv/ratio_step_to_reasoning": 1.5252502846270999, "adv/std_final_conf": 0.7952468991279602, "adv/std_reasoning": 0.7574864029884338, "adv/std_step_conf": 0.9340078234672546, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7097506437186611, "calib/avg_num_step_conf": 5.75, "calib/ece": 0.30972111553784853, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8764940239043825, "calib/gap": 0.12637755793467942, "calib/mean_conf": 0.9352191235059761, "calib/mu_c": 0.9825477707006369, "calib/mu_w": 0.8561702127659575, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30972111553784853, "calib/std_conf": 0.17461012858597133, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4136817653890824, "calib/step_q_c_n": 861.0, "calib/step_q_gap": 0.07517112709121004, "calib/step_q_w": 0.33851063829787237, "calib/step_q_w_n": 611.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2637.0, "completions/max_terminated_length": 2637.0, "completions/mean_length": 528.4921875, "completions/mean_terminated_length": 532.653564453125, "completions/min_length": 0.0, "completions/min_terminated_length": 118.0, "epoch": 0.0672, "grad_norm": 0.049538128077983856, "kl": 0.06967926025390625, "learning_rate": 3.8055555555555556e-06, "loss": 0.0515, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03559984266757965, "mask/share_reasoning": 0.8368204832077026, "mask/share_step_conf": 0.1197671964764595, "num_tokens": 14840410.0, "reward": 0.9256917834281921, "reward_std": 0.19786253571510315, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6848984360694885, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8477350473403931, "step": 63 }, { "adv/mean_abs_final_conf": 0.5772940516471863, "adv/mean_abs_reasoning": 0.48638617992401123, "adv/mean_abs_step_conf": 0.7385168075561523, "adv/ratio_final_to_reasoning": 1.1869047178465837, "adv/ratio_step_to_reasoning": 1.5183753939545153, "adv/std_final_conf": 0.7961284518241882, "adv/std_reasoning": 0.739449679851532, "adv/std_step_conf": 0.9341706037521362, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6242081447963801, "calib/avg_num_step_conf": 5.6015625, "calib/ece": 0.28991935483870984, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9274193548387096, "calib/gap": 0.06342081447963799, "calib/mean_conf": 0.9569354838709677, "calib/mu_c": 0.9768823529411764, "calib/mu_w": 0.9134615384615384, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2806854838709679, "calib/std_conf": 0.15405632306523084, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4110068649885584, "calib/step_q_c_n": 874.0, "calib/step_q_gap": 0.10820329355998692, "calib/step_q_w": 0.30280357142857145, "calib/step_q_w_n": 560.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2770.0, "completions/max_terminated_length": 2770.0, "completions/mean_length": 465.09375, "completions/mean_terminated_length": 470.60870361328125, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.06826666666666667, "grad_norm": 0.05453026294708252, "kl": 0.0635986328125, "learning_rate": 3.777777777777778e-06, "loss": -0.0028, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.0382472425699234, "mask/share_reasoning": 0.8235123157501221, "mask/share_step_conf": 0.12652164697647095, "num_tokens": 15063250.0, "reward": 0.9168107509613037, "reward_std": 0.2164766490459442, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.6919308304786682, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8159095048904419, "step": 64 }, { "adv/mean_abs_final_conf": 0.4365313649177551, "adv/mean_abs_reasoning": 0.26889199018478394, "adv/mean_abs_step_conf": 0.7722085118293762, "adv/ratio_final_to_reasoning": 1.623445029425267, "adv/ratio_step_to_reasoning": 2.8718167145801203, "adv/std_final_conf": 0.6810697913169861, "adv/std_reasoning": 0.5482594966888428, "adv/std_step_conf": 0.9330787658691406, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7023010546500479, "calib/avg_num_step_conf": 5.09765625, "calib/ece": 0.4046062992125985, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.968503937007874, "calib/gap": 0.013262384148290285, "calib/mean_conf": 0.986732283464567, "calib/mu_c": 0.9922147651006712, "calib/mu_w": 0.9789523809523809, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40236220472440953, "calib/std_conf": 0.06243067951880301, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.44182284980744546, "calib/step_q_c_n": 779.0, "calib/step_q_gap": 0.0518988954348219, "calib/step_q_w": 0.38992395437262356, "calib/step_q_w_n": 526.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2368.0, "completions/max_terminated_length": 2368.0, "completions/mean_length": 401.34375, "completions/mean_terminated_length": 401.34375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.06933333333333333, "grad_norm": 0.054482534527778625, "kl": 0.072052001953125, "learning_rate": 3.7500000000000005e-06, "loss": 0.0635, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04360418766736984, "mask/share_reasoning": 0.8223955631256104, "mask/share_step_conf": 0.13400021195411682, "num_tokens": 15271018.0, "reward": 0.8871078491210938, "reward_std": 0.11454544961452484, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.5952550768852234, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8633356690406799, "step": 65 }, { "adv/mean_abs_final_conf": 0.5894931554794312, "adv/mean_abs_reasoning": 0.4546332061290741, "adv/mean_abs_step_conf": 0.746368944644928, "adv/ratio_final_to_reasoning": 1.2966346222235012, "adv/ratio_step_to_reasoning": 1.641694743329038, "adv/std_final_conf": 0.7794395685195923, "adv/std_reasoning": 0.7014214396476746, "adv/std_step_conf": 0.9329808354377747, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7335336839469071, "calib/avg_num_step_conf": 5.85546875, "calib/ece": 0.39201581027667987, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.8695652173913043, "calib/gap": 0.13963498622589532, "calib/mean_conf": 0.9099604743083005, "calib/mu_c": 0.9767424242424243, "calib/mu_w": 0.837107438016529, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39011857707509884, "calib/std_conf": 0.24128198126911699, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4097321428571429, "calib/step_q_c_n": 672.0, "calib/step_q_gap": 0.13024000259112112, "calib/step_q_w": 0.27949214026602176, "calib/step_q_w_n": 827.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2143.0, "completions/max_terminated_length": 2143.0, "completions/mean_length": 512.421875, "completions/mean_terminated_length": 516.4566650390625, "completions/min_length": 0.0, "completions/min_terminated_length": 120.0, "epoch": 0.0704, "grad_norm": 0.02887682057917118, "kl": 0.05440521240234375, "learning_rate": 3.7222222222222225e-06, "loss": -0.0044, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.035790372639894485, "mask/share_reasoning": 0.8343870639801025, "mask/share_step_conf": 0.12201003730297089, "num_tokens": 15508550.0, "reward": 0.883568525314331, "reward_std": 0.18583741784095764, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.6040624380111694, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8622932434082031, "step": 66 }, { "adv/mean_abs_final_conf": 0.46097415685653687, "adv/mean_abs_reasoning": 0.3053887188434601, "adv/mean_abs_step_conf": 0.7570107579231262, "adv/ratio_final_to_reasoning": 1.5094668807750842, "adv/ratio_step_to_reasoning": 2.4788432290164724, "adv/std_final_conf": 0.690024197101593, "adv/std_reasoning": 0.5959534645080566, "adv/std_step_conf": 0.9334073662757874, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7326582789477225, "calib/avg_num_step_conf": 5.36328125, "calib/ece": 0.28363281249999994, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.8671875, "calib/gap": 0.1248556818946377, "calib/mean_conf": 0.9342578125000001, "calib/mu_c": 0.9776646706586826, "calib/mu_w": 0.8528089887640449, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28277343749999995, "calib/std_conf": 0.16308619011956482, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.39914218566392473, "calib/step_q_c_n": 851.0, "calib/step_q_gap": 0.09987015501258373, "calib/step_q_w": 0.299272030651341, "calib/step_q_w_n": 522.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1571.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 485.20703125, "completions/mean_terminated_length": 487.1098327636719, "completions/min_length": 0.0, "completions/min_terminated_length": 190.0, "epoch": 0.07146666666666666, "grad_norm": 0.038448333740234375, "kl": 0.05216217041015625, "learning_rate": 3.694444444444445e-06, "loss": -0.0571, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.035844773054122925, "mask/share_reasoning": 0.8427526950836182, "mask/share_step_conf": 0.11749625205993652, "num_tokens": 15737771.0, "reward": 0.9710288047790527, "reward_std": 0.11427510529756546, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7237683534622192, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8878204822540283, "step": 67 }, { "adv/mean_abs_final_conf": 0.5147673487663269, "adv/mean_abs_reasoning": 0.41767218708992004, "adv/mean_abs_step_conf": 0.7670242786407471, "adv/ratio_final_to_reasoning": 1.232467386332103, "adv/ratio_step_to_reasoning": 1.8364265142596519, "adv/std_final_conf": 0.7798352837562561, "adv/std_reasoning": 0.7012434601783752, "adv/std_step_conf": 0.9333664774894714, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6275355637513171, "calib/avg_num_step_conf": 5.59375, "calib/ece": 0.3505600000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.896, "calib/gap": 0.11449947312961006, "calib/mean_conf": 0.9345600000000001, "calib/mu_c": 0.9821917808219178, "calib/mu_w": 0.8676923076923078, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3505600000000001, "calib/std_conf": 0.17880941362243766, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4176557863501484, "calib/step_q_c_n": 674.0, "calib/step_q_gap": 0.1449249156377473, "calib/step_q_w": 0.2727308707124011, "calib/step_q_w_n": 758.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2741.0, "completions/max_terminated_length": 2741.0, "completions/mean_length": 482.67578125, "completions/mean_terminated_length": 484.56866455078125, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.07253333333333334, "grad_norm": 0.30053746700286865, "kl": 0.09393692016601562, "learning_rate": 3.6666666666666666e-06, "loss": 0.0184, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03930802643299103, "mask/share_reasoning": 0.8303000926971436, "mask/share_step_conf": 0.12648558616638184, "num_tokens": 15965424.0, "reward": 0.8982089757919312, "reward_std": 0.16178368031978607, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6389793157577515, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8488449454307556, "step": 68 }, { "adv/mean_abs_final_conf": 0.6562919616699219, "adv/mean_abs_reasoning": 0.4475212097167969, "adv/mean_abs_step_conf": 0.7461894750595093, "adv/ratio_final_to_reasoning": 1.4665047095426842, "adv/ratio_step_to_reasoning": 1.6673834867663981, "adv/std_final_conf": 0.8225091695785522, "adv/std_reasoning": 0.7013913989067078, "adv/std_step_conf": 0.934605598449707, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6312986350299783, "calib/avg_num_step_conf": 5.828125, "calib/ece": 0.3744223107569721, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7569721115537849, "calib/gap": 0.05847620870008918, "calib/mean_conf": 0.8808764940239044, "calib/mu_c": 0.908134328358209, "calib/mu_w": 0.8496581196581198, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36071713147410356, "calib/std_conf": 0.2134931180143776, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4194698085419736, "calib/step_q_c_n": 679.0, "calib/step_q_gap": 0.09960510989498711, "calib/step_q_w": 0.31986469864698647, "calib/step_q_w_n": 813.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2466.0, "completions/max_terminated_length": 2466.0, "completions/mean_length": 591.515625, "completions/mean_terminated_length": 596.1732177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.0736, "grad_norm": 0.08974709361791611, "kl": 0.2734527587890625, "learning_rate": 3.638888888888889e-06, "loss": -0.0409, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.031212469562888145, "mask/share_reasoning": 0.8525819778442383, "mask/share_step_conf": 0.10839303582906723, "num_tokens": 16221348.0, "reward": 0.8734649419784546, "reward_std": 0.18273797631263733, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6022570133209229, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8438915014266968, "step": 69 }, { "adv/mean_abs_final_conf": 0.6758154034614563, "adv/mean_abs_reasoning": 0.5166471004486084, "adv/mean_abs_step_conf": 0.737113356590271, "adv/ratio_final_to_reasoning": 1.308079350246311, "adv/ratio_step_to_reasoning": 1.4267250429746536, "adv/std_final_conf": 0.8379309773445129, "adv/std_reasoning": 0.7752685546875, "adv/std_step_conf": 0.9346587061882019, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7898451730418943, "calib/avg_num_step_conf": 5.84375, "calib/ece": 0.30862903225806443, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6814516129032258, "calib/gap": 0.262110330470986, "calib/mean_conf": 0.8166935483870968, "calib/mu_c": 0.9456349206349206, "calib/mu_w": 0.6835245901639346, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.30862903225806443, "calib/std_conf": 0.28888333892207035, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.45647058823529413, "calib/step_q_c_n": 629.0, "calib/step_q_gap": 0.18273356401384078, "calib/step_q_w": 0.27373702422145335, "calib/step_q_w_n": 867.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2707.0, "completions/max_terminated_length": 2707.0, "completions/mean_length": 525.234375, "completions/mean_terminated_length": 540.0, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.07466666666666667, "grad_norm": 0.04197907820343971, "kl": 0.05033111572265625, "learning_rate": 3.6111111111111115e-06, "loss": -0.1658, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.0343087837100029, "mask/share_reasoning": 0.8175104260444641, "mask/share_step_conf": 0.12083704024553299, "num_tokens": 16462800.0, "reward": 0.9065558910369873, "reward_std": 0.20242035388946533, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.6768535375595093, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8448520302772522, "step": 70 }, { "adv/mean_abs_final_conf": 0.6989947557449341, "adv/mean_abs_reasoning": 0.5246307253837585, "adv/mean_abs_step_conf": 0.737536609172821, "adv/ratio_final_to_reasoning": 1.3323557350432176, "adv/ratio_step_to_reasoning": 1.405820462828069, "adv/std_final_conf": 0.8714163899421692, "adv/std_reasoning": 0.7754472494125366, "adv/std_step_conf": 0.9344663619995117, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6917724765935616, "calib/avg_num_step_conf": 5.609375, "calib/ece": 0.2707171314741035, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.549800796812749, "calib/gap": 0.16669039374118255, "calib/mean_conf": 0.7711155378486056, "calib/mu_c": 0.8461594202898551, "calib/mu_w": 0.6794690265486726, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.24601593625497997, "calib/std_conf": 0.28485886384028153, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4204925373134329, "calib/step_q_c_n": 670.0, "calib/step_q_gap": 0.08605389501578276, "calib/step_q_w": 0.33443864229765013, "calib/step_q_w_n": 766.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2827.0, "completions/max_terminated_length": 2827.0, "completions/mean_length": 522.87109375, "completions/mean_terminated_length": 522.87109375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.07573333333333333, "grad_norm": 0.030360069125890732, "kl": 0.054576873779296875, "learning_rate": 3.5833333333333335e-06, "loss": 0.0097, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03734172135591507, "mask/share_reasoning": 0.8406904935836792, "mask/share_step_conf": 0.12196780741214752, "num_tokens": 16701063.0, "reward": 0.9102877378463745, "reward_std": 0.18750911951065063, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6886539459228516, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8287965059280396, "step": 71 }, { "adv/mean_abs_final_conf": 0.7164885997772217, "adv/mean_abs_reasoning": 0.4583200514316559, "adv/mean_abs_step_conf": 0.767371654510498, "adv/ratio_final_to_reasoning": 1.5632931562542896, "adv/ratio_step_to_reasoning": 1.6743139474554007, "adv/std_final_conf": 0.9001376032829285, "adv/std_reasoning": 0.7392032146453857, "adv/std_step_conf": 0.9336949586868286, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7175264713124847, "calib/avg_num_step_conf": 4.890625, "calib/ece": 0.2699084967320261, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5137254901960784, "calib/gap": 0.188356521382254, "calib/mean_conf": 0.7594248366013073, "calib/mu_c": 0.8510178117048347, "calib/mu_w": 0.6626612903225807, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2578039215686274, "calib/std_conf": 0.28468599893357543, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.44402117834394894, "calib/step_q_c_n": 628.0, "calib/step_q_gap": 0.07713015270292328, "calib/step_q_w": 0.36689102564102566, "calib/step_q_w_n": 624.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1704.0, "completions/max_terminated_length": 1704.0, "completions/mean_length": 468.11328125, "completions/mean_terminated_length": 468.11328125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.0768, "grad_norm": 0.05988554283976555, "kl": 0.3415679931640625, "learning_rate": 3.555555555555556e-06, "loss": -0.0079, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03583850711584091, "mask/share_reasoning": 0.8487659692764282, "mask/share_step_conf": 0.11539548635482788, "num_tokens": 16925308.0, "reward": 0.9356938600540161, "reward_std": 0.16360831260681152, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7001357674598694, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8696893453598022, "step": 72 }, { "adv/mean_abs_final_conf": 0.7666511535644531, "adv/mean_abs_reasoning": 0.614189624786377, "adv/mean_abs_step_conf": 0.7640584707260132, "adv/ratio_final_to_reasoning": 1.2482320160180242, "adv/ratio_step_to_reasoning": 1.2440107092199133, "adv/std_final_conf": 0.9226399660110474, "adv/std_reasoning": 0.8266867399215698, "adv/std_step_conf": 0.9349403977394104, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7521962937542895, "calib/avg_num_step_conf": 5.10546875, "calib/ece": 0.1332530120481928, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.42168674698795183, "calib/gap": 0.2640789293067949, "calib/mean_conf": 0.7317269076305221, "calib/mu_c": 0.8314193548387097, "calib/mu_w": 0.5673404255319148, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.12124497991967871, "calib/std_conf": 0.2826875993640548, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4355613422818792, "calib/step_q_c_n": 745.0, "calib/step_q_gap": 0.06387095082280447, "calib/step_q_w": 0.37169039145907473, "calib/step_q_w_n": 562.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3070.0, "completions/max_terminated_length": 3070.0, "completions/mean_length": 489.25, "completions/mean_terminated_length": 491.1686706542969, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.07786666666666667, "grad_norm": 0.044797178357839584, "kl": 0.0715484619140625, "learning_rate": 3.5277777777777784e-06, "loss": -0.0412, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03444898873567581, "mask/share_reasoning": 0.8487522006034851, "mask/share_step_conf": 0.11289255321025848, "num_tokens": 17157588.0, "reward": 0.9419361352920532, "reward_std": 0.21368886530399323, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7651835680007935, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8054074048995972, "step": 73 }, { "adv/mean_abs_final_conf": 0.7724243998527527, "adv/mean_abs_reasoning": 0.599509596824646, "adv/mean_abs_step_conf": 0.7545539736747742, "adv/ratio_final_to_reasoning": 1.288427080974124, "adv/ratio_step_to_reasoning": 1.258618673781594, "adv/std_final_conf": 0.9349808692932129, "adv/std_reasoning": 0.8099501132965088, "adv/std_step_conf": 0.9347738027572632, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7580335254753859, "calib/avg_num_step_conf": 4.90234375, "calib/ece": 0.10906504065040648, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.2073170731707317, "calib/gap": 0.28326972768833225, "calib/mean_conf": 0.5492276422764228, "calib/mu_c": 0.6839534883720929, "calib/mu_w": 0.40068376068376066, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.0669512195121951, "calib/std_conf": 0.31374704515197527, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4218228279386712, "calib/step_q_c_n": 587.0, "calib/step_q_gap": 0.10288570219016818, "calib/step_q_w": 0.318937125748503, "calib/step_q_w_n": 668.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2676.0, "completions/max_terminated_length": 2676.0, "completions/mean_length": 480.95703125, "completions/mean_terminated_length": 488.59130859375, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.07893333333333333, "grad_norm": 0.03414783999323845, "kl": 0.08736419677734375, "learning_rate": 3.5e-06, "loss": -0.0599, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03647315502166748, "mask/share_reasoning": 0.8328334093093872, "mask/share_step_conf": 0.11506839096546173, "num_tokens": 17384641.0, "reward": 0.9393357634544373, "reward_std": 0.17300420999526978, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7562761306762695, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8309890031814575, "step": 74 }, { "adv/mean_abs_final_conf": 0.7449522614479065, "adv/mean_abs_reasoning": 0.4977647066116333, "adv/mean_abs_step_conf": 0.7689626812934875, "adv/ratio_final_to_reasoning": 1.4965951815244591, "adv/ratio_step_to_reasoning": 1.5448316666079918, "adv/std_final_conf": 0.9269348978996277, "adv/std_reasoning": 0.7752844095230103, "adv/std_step_conf": 0.934822142124176, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7669344675488343, "calib/avg_num_step_conf": 4.8828125, "calib/ece": 0.10792094861660081, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.40711462450592883, "calib/gap": 0.30033043478260874, "calib/mean_conf": 0.6924743083003954, "calib/mu_c": 0.7743826086956522, "calib/mu_w": 0.4740521739130435, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.03656126482213439, "calib/std_conf": 0.3128283584172879, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.43616800920598386, "calib/step_q_c_n": 869.0, "calib/step_q_gap": 0.08076118505900226, "calib/step_q_w": 0.3554068241469816, "calib/step_q_w_n": 381.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3023.0, "completions/max_terminated_length": 3023.0, "completions/mean_length": 431.88671875, "completions/mean_terminated_length": 433.5804138183594, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.08, "grad_norm": 0.08078334480524063, "kl": 0.092681884765625, "learning_rate": 3.4722222222222224e-06, "loss": -0.0185, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.038396649062633514, "mask/share_reasoning": 0.8343999981880188, "mask/share_step_conf": 0.12329712510108948, "num_tokens": 17599956.0, "reward": 0.9955003261566162, "reward_std": 0.16879957914352417, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.809590220451355, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8407853841781616, "step": 75 }, { "adv/mean_abs_final_conf": 0.7436815500259399, "adv/mean_abs_reasoning": 0.5508506298065186, "adv/mean_abs_step_conf": 0.7598429918289185, "adv/ratio_final_to_reasoning": 1.3500602700357291, "adv/ratio_step_to_reasoning": 1.3793993338916697, "adv/std_final_conf": 0.9284547567367554, "adv/std_reasoning": 0.8097205758094788, "adv/std_step_conf": 0.934494137763977, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.704641812865497, "calib/avg_num_step_conf": 4.2890625, "calib/ece": 0.16167729083665333, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.3665338645418327, "calib/gap": 0.22810891812865497, "calib/mean_conf": 0.6665298804780877, "calib/mu_c": 0.739233918128655, "calib/mu_w": 0.511125, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.07346613545816727, "calib/std_conf": 0.31771581758015005, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4496159122085048, "calib/step_q_c_n": 729.0, "calib/step_q_gap": 0.06660778212720392, "calib/step_q_w": 0.38300813008130086, "calib/step_q_w_n": 369.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2252.0, "completions/max_terminated_length": 2252.0, "completions/mean_length": 469.73828125, "completions/mean_terminated_length": 473.43701171875, "completions/min_length": 0.0, "completions/min_terminated_length": 120.0, "epoch": 0.08106666666666666, "grad_norm": 0.03808128088712692, "kl": 0.091583251953125, "learning_rate": 3.444444444444445e-06, "loss": -0.0494, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03749001398682594, "mask/share_reasoning": 0.8472702503204346, "mask/share_step_conf": 0.10742717236280441, "num_tokens": 17823265.0, "reward": 0.9695121049880981, "reward_std": 0.16438232362270355, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7655134201049805, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8438230752944946, "step": 76 }, { "adv/mean_abs_final_conf": 0.7754688858985901, "adv/mean_abs_reasoning": 0.46595898270606995, "adv/mean_abs_step_conf": 0.7579265236854553, "adv/ratio_final_to_reasoning": 1.66424280822967, "adv/ratio_step_to_reasoning": 1.626594940361007, "adv/std_final_conf": 0.9346413612365723, "adv/std_reasoning": 0.7393398284912109, "adv/std_step_conf": 0.9346475005149841, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6738372919926041, "calib/avg_num_step_conf": 5.04296875, "calib/ece": 0.15510121457489887, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.3441295546558704, "calib/gap": 0.1813938273360829, "calib/mean_conf": 0.6770445344129554, "calib/mu_c": 0.7424050632911392, "calib/mu_w": 0.5610112359550563, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.09623481781376528, "calib/std_conf": 0.2917498686733845, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.42880299319727894, "calib/step_q_c_n": 735.0, "calib/step_q_gap": 0.09964831693828613, "calib/step_q_w": 0.3291546762589928, "calib/step_q_w_n": 556.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2741.0, "completions/max_terminated_length": 2741.0, "completions/mean_length": 484.9921875, "completions/mean_terminated_length": 490.74310302734375, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.08213333333333334, "grad_norm": 0.054215505719184875, "kl": 0.089569091796875, "learning_rate": 3.416666666666667e-06, "loss": -0.054, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.040008001029491425, "mask/share_reasoning": 0.8277705907821655, "mask/share_step_conf": 0.12050262093544006, "num_tokens": 18052087.0, "reward": 0.9375009536743164, "reward_std": 0.18988294899463654, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7365831732749939, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8227936029434204, "step": 77 }, { "adv/mean_abs_final_conf": 0.7341655492782593, "adv/mean_abs_reasoning": 0.5082210898399353, "adv/mean_abs_step_conf": 0.728459358215332, "adv/ratio_final_to_reasoning": 1.4445790699269985, "adv/ratio_step_to_reasoning": 1.4333512968632627, "adv/std_final_conf": 0.9238205552101135, "adv/std_reasoning": 0.7927084565162659, "adv/std_step_conf": 0.9343314170837402, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7181290024427279, "calib/avg_num_step_conf": 4.9375, "calib/ece": 0.21872089947089954, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5674603174603174, "calib/gap": 0.21723846306199257, "calib/mean_conf": 0.7611203703703704, "calib/mu_c": 0.8464640522875818, "calib/mu_w": 0.6292255892255892, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.18634920634920643, "calib/std_conf": 0.31027963019646326, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.44736795212765956, "calib/step_q_c_n": 752.0, "calib/step_q_gap": 0.05672107712765956, "calib/step_q_w": 0.390646875, "calib/step_q_w_n": 512.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1540.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 497.40234375, "completions/mean_terminated_length": 501.31890869140625, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.0832, "grad_norm": 0.03283295780420303, "kl": 0.0950469970703125, "learning_rate": 3.3888888888888893e-06, "loss": -0.0921, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.033064231276512146, "mask/share_reasoning": 0.8525799512863159, "mask/share_step_conf": 0.10654333233833313, "num_tokens": 18287446.0, "reward": 0.945235013961792, "reward_std": 0.19811129570007324, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7295798063278198, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8444838523864746, "step": 78 }, { "adv/mean_abs_final_conf": 0.6889576315879822, "adv/mean_abs_reasoning": 0.46892887353897095, "adv/mean_abs_step_conf": 0.7485886812210083, "adv/ratio_final_to_reasoning": 1.469215632614965, "adv/ratio_step_to_reasoning": 1.5963800129675654, "adv/std_final_conf": 0.88222736120224, "adv/std_reasoning": 0.7206056118011475, "adv/std_step_conf": 0.9345911145210266, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6310708598726115, "calib/avg_num_step_conf": 5.09765625, "calib/ece": 0.3037146245059287, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6837944664031621, "calib/gap": 0.09512561040339707, "calib/mean_conf": 0.8567596837944664, "calib/mu_c": 0.8928547770700637, "calib/mu_w": 0.7977291666666666, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2699604743083003, "calib/std_conf": 0.24993430622062862, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4742439644218552, "calib/step_q_c_n": 787.0, "calib/step_q_gap": 0.06760303778092852, "calib/step_q_w": 0.40664092664092666, "calib/step_q_w_n": 518.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2703.0, "completions/max_terminated_length": 2703.0, "completions/mean_length": 509.5078125, "completions/mean_terminated_length": 513.5196533203125, "completions/min_length": 0.0, "completions/min_terminated_length": 159.0, "epoch": 0.08426666666666667, "grad_norm": 0.0327574759721756, "kl": 0.08077239990234375, "learning_rate": 3.3611111111111117e-06, "loss": -0.0964, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03413655236363411, "mask/share_reasoning": 0.8568128943443298, "mask/share_step_conf": 0.10123801231384277, "num_tokens": 18524256.0, "reward": 0.9247376918792725, "reward_std": 0.19518733024597168, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6796905994415283, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8502533435821533, "step": 79 }, { "adv/mean_abs_final_conf": 0.6469038128852844, "adv/mean_abs_reasoning": 0.5513267517089844, "adv/mean_abs_step_conf": 0.7641560435295105, "adv/ratio_final_to_reasoning": 1.1733582868598946, "adv/ratio_step_to_reasoning": 1.386031135185813, "adv/std_final_conf": 0.8788886666297913, "adv/std_reasoning": 0.7927740216255188, "adv/std_step_conf": 0.9344629049301147, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6631141618497111, "calib/avg_num_step_conf": 5.50390625, "calib/ece": 0.2358517786561264, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8458498023715415, "calib/gap": 0.19351343208092486, "calib/mean_conf": 0.9094446640316206, "calib/mu_c": 0.970634682080925, "calib/mu_w": 0.7771212500000001, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2307509881422924, "calib/std_conf": 0.23273399517701132, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4987823275862069, "calib/step_q_c_n": 928.0, "calib/step_q_gap": 0.07834511344899275, "calib/step_q_w": 0.42043721413721413, "calib/step_q_w_n": 481.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2738.0, "completions/max_terminated_length": 2738.0, "completions/mean_length": 475.51171875, "completions/mean_terminated_length": 475.51171875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.08533333333333333, "grad_norm": 0.025196747854351997, "kl": 0.09796142578125, "learning_rate": 3.3333333333333333e-06, "loss": -0.0351, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.037030771374702454, "mask/share_reasoning": 0.8361947536468506, "mask/share_step_conf": 0.12677444517612457, "num_tokens": 18748147.0, "reward": 0.960274875164032, "reward_std": 0.21059617400169373, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7495396733283997, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8389787673950195, "step": 80 }, { "adv/mean_abs_final_conf": 0.6059747934341431, "adv/mean_abs_reasoning": 0.4931219816207886, "adv/mean_abs_step_conf": 0.7429808378219604, "adv/ratio_final_to_reasoning": 1.228853744143449, "adv/ratio_step_to_reasoning": 1.5066877274055765, "adv/std_final_conf": 0.8232892751693726, "adv/std_reasoning": 0.7575910687446594, "adv/std_step_conf": 0.934981644153595, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7076734811402247, "calib/avg_num_step_conf": 5.24609375, "calib/ece": 0.2738987654320988, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.7901234567901234, "calib/gap": 0.2289561762165273, "calib/mean_conf": 0.8807514403292181, "calib/mu_c": 0.9674344370860927, "calib/mu_w": 0.7384782608695654, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.2666255144032922, "calib/std_conf": 0.2550425248734223, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5346244477172312, "calib/step_q_c_n": 679.0, "calib/step_q_gap": 0.15555818265699023, "calib/step_q_w": 0.37906626506024094, "calib/step_q_w_n": 664.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3014.0, "completions/max_terminated_length": 3014.0, "completions/mean_length": 503.9296875, "completions/mean_terminated_length": 513.9681396484375, "completions/min_length": 0.0, "completions/min_terminated_length": 176.0, "epoch": 0.0864, "grad_norm": 0.023068297654390335, "kl": 0.0867462158203125, "learning_rate": 3.3055555555555558e-06, "loss": -0.117, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.0342206209897995, "mask/share_reasoning": 0.8357434868812561, "mask/share_step_conf": 0.11050460487604141, "num_tokens": 18983401.0, "reward": 0.9053604602813721, "reward_std": 0.22578772902488708, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6947583556175232, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8089312314987183, "step": 81 }, { "adv/mean_abs_final_conf": 0.614736020565033, "adv/mean_abs_reasoning": 0.5235514044761658, "adv/mean_abs_step_conf": 0.7543563842773438, "adv/ratio_final_to_reasoning": 1.174165545750185, "adv/ratio_step_to_reasoning": 1.4408449253079698, "adv/std_final_conf": 0.8214467763900757, "adv/std_reasoning": 0.7752867937088013, "adv/std_step_conf": 0.9348330497741699, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6453923593185338, "calib/avg_num_step_conf": 4.890625, "calib/ece": 0.3333201581027669, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.83399209486166, "calib/gap": 0.1542010841507485, "calib/mean_conf": 0.9102371541501976, "calib/mu_c": 0.9736241610738254, "calib/mu_w": 0.8194230769230769, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.32731225296442695, "calib/std_conf": 0.2233431377872269, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5443872919818458, "calib/step_q_c_n": 661.0, "calib/step_q_gap": 0.1393788317449592, "calib/step_q_w": 0.4050084602368866, "calib/step_q_w_n": 591.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2514.0, "completions/max_terminated_length": 2514.0, "completions/mean_length": 448.171875, "completions/mean_terminated_length": 449.929443359375, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.08746666666666666, "grad_norm": 0.048648640513420105, "kl": 0.0974578857421875, "learning_rate": 3.277777777777778e-06, "loss": -0.0345, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03892158344388008, "mask/share_reasoning": 0.8418777585029602, "mask/share_step_conf": 0.11529439687728882, "num_tokens": 19203685.0, "reward": 0.9137436747550964, "reward_std": 0.21004939079284668, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6714894771575928, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8419353365898132, "step": 82 }, { "adv/mean_abs_final_conf": 0.6282051205635071, "adv/mean_abs_reasoning": 0.48869287967681885, "adv/mean_abs_step_conf": 0.7447449564933777, "adv/ratio_final_to_reasoning": 1.2854804043368713, "adv/ratio_step_to_reasoning": 1.5239529517718584, "adv/std_final_conf": 0.8340749144554138, "adv/std_reasoning": 0.7394261360168457, "adv/std_step_conf": 0.9344735145568848, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6325115562403698, "calib/avg_num_step_conf": 4.62109375, "calib/ece": 0.36741999999999986, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.772, "calib/gap": 0.1394003595274782, "calib/mean_conf": 0.8589, "calib/mu_c": 0.9246969696969697, "calib/mu_w": 0.7852966101694915, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.34915999999999986, "calib/std_conf": 0.2897531535635117, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5318982387475538, "calib/step_q_c_n": 511.0, "calib/step_q_gap": 0.11579704827136333, "calib/step_q_w": 0.4161011904761905, "calib/step_q_w_n": 672.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1577.0, "completions/max_terminated_length": 1577.0, "completions/mean_length": 510.96484375, "completions/mean_terminated_length": 514.9881591796875, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.08853333333333334, "grad_norm": 0.033095818012952805, "kl": 0.09009552001953125, "learning_rate": 3.2500000000000002e-06, "loss": -0.1238, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03497224301099777, "mask/share_reasoning": 0.8547062277793884, "mask/share_step_conf": 0.1025090143084526, "num_tokens": 19441756.0, "reward": 0.8709297180175781, "reward_std": 0.22499291598796844, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6079858541488647, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8369985818862915, "step": 83 }, { "adv/mean_abs_final_conf": 0.7021284699440002, "adv/mean_abs_reasoning": 0.5409032702445984, "adv/mean_abs_step_conf": 0.7552073001861572, "adv/ratio_final_to_reasoning": 1.298066601864868, "adv/ratio_step_to_reasoning": 1.3961965876905307, "adv/std_final_conf": 0.8890236020088196, "adv/std_reasoning": 0.7927778363227844, "adv/std_step_conf": 0.934401273727417, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7623575361762068, "calib/avg_num_step_conf": 4.4140625, "calib/ece": 0.3165027888446216, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.7370517928286853, "calib/gap": 0.24678008707901156, "calib/mean_conf": 0.8535370517928287, "calib/mu_c": 0.9656204379562046, "calib/mu_w": 0.718840350877193, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.31211155378486066, "calib/std_conf": 0.27613141860409884, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5193537964458805, "calib/step_q_c_n": 619.0, "calib/step_q_gap": 0.08021485319734822, "calib/step_q_w": 0.4391389432485323, "calib/step_q_w_n": 511.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2868.0, "completions/max_terminated_length": 2868.0, "completions/mean_length": 447.01171875, "completions/mean_terminated_length": 447.01171875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.0896, "grad_norm": 0.04072084650397301, "kl": 0.0982513427734375, "learning_rate": 3.2222222222222227e-06, "loss": 0.002, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03983251377940178, "mask/share_reasoning": 0.8514313697814941, "mask/share_step_conf": 0.10873612016439438, "num_tokens": 19662111.0, "reward": 0.9126341342926025, "reward_std": 0.22445017099380493, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6862176060676575, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8367067575454712, "step": 84 }, { "adv/mean_abs_final_conf": 0.6004186868667603, "adv/mean_abs_reasoning": 0.457645446062088, "adv/mean_abs_step_conf": 0.7367417812347412, "adv/ratio_final_to_reasoning": 1.3119734764831517, "adv/ratio_step_to_reasoning": 1.6098527529863995, "adv/std_final_conf": 0.8238394856452942, "adv/std_reasoning": 0.7392846941947937, "adv/std_step_conf": 0.9346275329589844, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7251965923984273, "calib/avg_num_step_conf": 5.1796875, "calib/ece": 0.28236947791164657, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.7028112449799196, "calib/gap": 0.26753211009174327, "calib/mean_conf": 0.8218875502008033, "calib/mu_c": 0.9389999999999998, "calib/mu_w": 0.6714678899082566, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.27100401606425706, "calib/std_conf": 0.300692726664778, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.45204316546762585, "calib/step_q_c_n": 695.0, "calib/step_q_gap": 0.08286725421564489, "calib/step_q_w": 0.36917591125198096, "calib/step_q_w_n": 631.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2873.0, "completions/max_terminated_length": 2873.0, "completions/mean_length": 530.91015625, "completions/mean_terminated_length": 537.20556640625, "completions/min_length": 0.0, "completions/min_terminated_length": 99.0, "epoch": 0.09066666666666667, "grad_norm": 0.029364172369241714, "kl": 0.079315185546875, "learning_rate": 3.1944444444444443e-06, "loss": -0.1209, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.035193197429180145, "mask/share_reasoning": 0.8402204513549805, "mask/share_step_conf": 0.11286762356758118, "num_tokens": 19905848.0, "reward": 0.910934329032898, "reward_std": 0.23398302495479584, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6924851536750793, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8309459686279297, "step": 85 }, { "adv/mean_abs_final_conf": 0.7090624570846558, "adv/mean_abs_reasoning": 0.5809318423271179, "adv/mean_abs_step_conf": 0.7425640821456909, "adv/ratio_final_to_reasoning": 1.2205604950905558, "adv/ratio_step_to_reasoning": 1.2782292655384506, "adv/std_final_conf": 0.9075257778167725, "adv/std_reasoning": 0.8099024891853333, "adv/std_step_conf": 0.9343748092651367, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.773311491935484, "calib/avg_num_step_conf": 4.78515625, "calib/ece": 0.21140476190476187, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5158730158730159, "calib/gap": 0.3180498991935483, "calib/mean_conf": 0.715452380952381, "calib/mu_c": 0.871953125, "calib/mu_w": 0.5539032258064517, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.20946031746031743, "calib/std_conf": 0.3358017202926065, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.45, "calib/step_q_c_n": 604.0, "calib/step_q_gap": 0.07099339774557167, "calib/step_q_w": 0.37900660225442834, "calib/step_q_w_n": 621.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1889.0, "completions/max_terminated_length": 1889.0, "completions/mean_length": 506.765625, "completions/mean_terminated_length": 508.7529602050781, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.09173333333333333, "grad_norm": 0.027637803927063942, "kl": 0.08685302734375, "learning_rate": 3.1666666666666667e-06, "loss": -0.1019, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.035529859364032745, "mask/share_reasoning": 0.8527402877807617, "mask/share_step_conf": 0.10782356560230255, "num_tokens": 20141092.0, "reward": 0.9500056505203247, "reward_std": 0.20182755589485168, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.7396859526634216, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8642314672470093, "step": 86 }, { "adv/mean_abs_final_conf": 0.632361888885498, "adv/mean_abs_reasoning": 0.47548264265060425, "adv/mean_abs_step_conf": 0.757982611656189, "adv/ratio_final_to_reasoning": 1.3299368518698427, "adv/ratio_step_to_reasoning": 1.5941330842925685, "adv/std_final_conf": 0.8701613545417786, "adv/std_reasoning": 0.7393690347671509, "adv/std_step_conf": 0.9344350099563599, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7125527426160339, "calib/avg_num_step_conf": 4.5703125, "calib/ece": 0.18223346828609988, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6639676113360324, "calib/gap": 0.22428985834840287, "calib/mean_conf": 0.8303171390013495, "calib/mu_c": 0.9020535714285716, "calib/mu_w": 0.6777637130801687, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16619433198380568, "calib/std_conf": 0.274614031204716, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.493969571230982, "calib/step_q_c_n": 723.0, "calib/step_q_gap": 0.10150872111912518, "calib/step_q_w": 0.39246085011185683, "calib/step_q_w_n": 447.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3028.0, "completions/max_terminated_length": 3028.0, "completions/mean_length": 418.453125, "completions/mean_terminated_length": 426.7888488769531, "completions/min_length": 0.0, "completions/min_terminated_length": 96.0, "epoch": 0.0928, "grad_norm": 0.029977787286043167, "kl": 0.09900665283203125, "learning_rate": 3.138888888888889e-06, "loss": -0.0592, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.04178471863269806, "mask/share_reasoning": 0.8263924717903137, "mask/share_step_conf": 0.11229157447814941, "num_tokens": 20353712.0, "reward": 0.9529500603675842, "reward_std": 0.19986681640148163, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7545884847640991, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8263115286827087, "step": 87 }, { "adv/mean_abs_final_conf": 0.6885051727294922, "adv/mean_abs_reasoning": 0.5059357285499573, "adv/mean_abs_step_conf": 0.7486658096313477, "adv/ratio_final_to_reasoning": 1.3608550135464639, "adv/ratio_step_to_reasoning": 1.4797646566236182, "adv/std_final_conf": 0.8923454284667969, "adv/std_reasoning": 0.757586658000946, "adv/std_step_conf": 0.9346413016319275, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.8414918414918414, "calib/avg_num_step_conf": 5.01171875, "calib/ece": 0.12470588235294121, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5490196078431373, "calib/gap": 0.42314296814296787, "calib/mean_conf": 0.7281568627450979, "calib/mu_c": 0.8924358974358972, "calib/mu_w": 0.4692929292929294, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.12054901960784317, "calib/std_conf": 0.3348949716862674, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4505395683453237, "calib/step_q_c_n": 834.0, "calib/step_q_gap": 0.08880237458140383, "calib/step_q_w": 0.3617371937639199, "calib/step_q_w_n": 449.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1680.0, "completions/max_terminated_length": 1680.0, "completions/mean_length": 485.25, "completions/mean_terminated_length": 487.1529846191406, "completions/min_length": 0.0, "completions/min_terminated_length": 120.0, "epoch": 0.09386666666666667, "grad_norm": 0.03266888111829758, "kl": 0.08789825439453125, "learning_rate": 3.1111111111111116e-06, "loss": -0.1147, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.035181909799575806, "mask/share_reasoning": 0.8508784770965576, "mask/share_step_conf": 0.11003339290618896, "num_tokens": 20587784.0, "reward": 0.9882926940917969, "reward_std": 0.17865976691246033, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.8217456936836243, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8368707895278931, "step": 88 }, { "adv/mean_abs_final_conf": 0.6719787120819092, "adv/mean_abs_reasoning": 0.49249130487442017, "adv/mean_abs_step_conf": 0.7202622294425964, "adv/ratio_final_to_reasoning": 1.3644478703096217, "adv/ratio_step_to_reasoning": 1.462487199903469, "adv/std_final_conf": 0.8897674083709717, "adv/std_reasoning": 0.7575967907905579, "adv/std_step_conf": 0.9350314736366272, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7737520661157025, "calib/avg_num_step_conf": 4.375, "calib/ece": 0.178780487804878, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.45121951219512196, "calib/gap": 0.36873256198347104, "calib/mean_conf": 0.6444715447154471, "calib/mu_c": 0.82584, "calib/mu_w": 0.457107438016529, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.15756097560975607, "calib/std_conf": 0.3605882800742173, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.47622907662082514, "calib/step_q_c_n": 509.0, "calib/step_q_gap": 0.12298848742278912, "calib/step_q_w": 0.353240589198036, "calib/step_q_w_n": 611.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3022.0, "completions/max_terminated_length": 3022.0, "completions/mean_length": 555.4765625, "completions/mean_terminated_length": 555.4765625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.09493333333333333, "grad_norm": 0.02888135053217411, "kl": 0.08712005615234375, "learning_rate": 3.0833333333333336e-06, "loss": -0.0078, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.038115061819553375, "mask/share_reasoning": 0.8615081310272217, "mask/share_step_conf": 0.10037682950496674, "num_tokens": 20838874.0, "reward": 0.9215790629386902, "reward_std": 0.21322081983089447, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.7409582138061523, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8154811859130859, "step": 89 }, { "adv/mean_abs_final_conf": 0.6734800338745117, "adv/mean_abs_reasoning": 0.4882771372795105, "adv/mean_abs_step_conf": 0.7472188472747803, "adv/ratio_final_to_reasoning": 1.3792987270034378, "adv/ratio_step_to_reasoning": 1.530317088852433, "adv/std_final_conf": 0.8759849667549133, "adv/std_reasoning": 0.7575928568840027, "adv/std_step_conf": 0.9344674944877625, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6883633178357254, "calib/avg_num_step_conf": 5.38671875, "calib/ece": 0.21848000000000006, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.56, "calib/gap": 0.24580149585607447, "calib/mean_conf": 0.69208, "calib/mu_c": 0.7874509803921569, "calib/mu_w": 0.5416494845360824, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.14928000000000005, "calib/std_conf": 0.3698741321044228, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4144972972972973, "calib/step_q_c_n": 777.0, "calib/step_q_gap": 0.06046075244679894, "calib/step_q_w": 0.35403654485049835, "calib/step_q_w_n": 602.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2434.0, "completions/max_terminated_length": 2434.0, "completions/mean_length": 505.44921875, "completions/mean_terminated_length": 505.44921875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.096, "grad_norm": 0.02644607611000538, "kl": 0.083831787109375, "learning_rate": 3.055555555555556e-06, "loss": 0.0431, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.035364821553230286, "mask/share_reasoning": 0.845434308052063, "mask/share_step_conf": 0.11920082569122314, "num_tokens": 21071589.0, "reward": 0.9392973780632019, "reward_std": 0.2013944387435913, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.71880704164505, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8449438810348511, "step": 90 }, { "adv/mean_abs_final_conf": 0.7215551137924194, "adv/mean_abs_reasoning": 0.5888091325759888, "adv/mean_abs_step_conf": 0.7760694622993469, "adv/ratio_final_to_reasoning": 1.225448237590471, "adv/ratio_step_to_reasoning": 1.3180323119380137, "adv/std_final_conf": 0.9045106768608093, "adv/std_reasoning": 0.8099533915519714, "adv/std_step_conf": 0.934012770652771, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7291061046511628, "calib/avg_num_step_conf": 5.203125, "calib/ece": 0.18215447154471542, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.43902439024390244, "calib/gap": 0.29467005813953484, "calib/mean_conf": 0.6414227642276422, "calib/mu_c": 0.7444375, "calib/mu_w": 0.44976744186046513, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.08658536585365852, "calib/std_conf": 0.3549338770007247, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.41673768308921444, "calib/step_q_c_n": 751.0, "calib/step_q_gap": 0.10808019599799246, "calib/step_q_w": 0.308657487091222, "calib/step_q_w_n": 581.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2746.0, "completions/max_terminated_length": 2746.0, "completions/mean_length": 513.13671875, "completions/mean_terminated_length": 519.2213745117188, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.09706666666666666, "grad_norm": 0.040382903069257736, "kl": 0.08388519287109375, "learning_rate": 3.0277777777777776e-06, "loss": -0.0624, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03338609263300896, "mask/share_reasoning": 0.8500442504882812, "mask/share_step_conf": 0.1048509031534195, "num_tokens": 21310664.0, "reward": 0.9495749473571777, "reward_std": 0.18228210508823395, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7500753998756409, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8318870663642883, "step": 91 }, { "adv/mean_abs_final_conf": 0.7286385297775269, "adv/mean_abs_reasoning": 0.46878039836883545, "adv/mean_abs_step_conf": 0.7632089257240295, "adv/ratio_final_to_reasoning": 1.5543280655780227, "adv/ratio_step_to_reasoning": 1.6280734612191237, "adv/std_final_conf": 0.9158452153205872, "adv/std_reasoning": 0.7574487924575806, "adv/std_step_conf": 0.9339590072631836, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7326384188439149, "calib/avg_num_step_conf": 4.7109375, "calib/ece": 0.14370823529411766, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.45098039215686275, "calib/gap": 0.29875386489779354, "calib/mean_conf": 0.6746447058823529, "calib/mu_c": 0.7789156626506025, "calib/mu_w": 0.480161797752809, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08368627450980395, "calib/std_conf": 0.3472926672800365, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4442895442359249, "calib/step_q_c_n": 746.0, "calib/step_q_gap": 0.09637650075766402, "calib/step_q_w": 0.3479130434782609, "calib/step_q_w_n": 460.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2936.0, "completions/max_terminated_length": 2936.0, "completions/mean_length": 443.125, "completions/mean_terminated_length": 443.125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.09813333333333334, "grad_norm": 0.0689389705657959, "kl": 0.08803558349609375, "learning_rate": 3e-06, "loss": -0.0926, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03953507915139198, "mask/share_reasoning": 0.843379557132721, "mask/share_step_conf": 0.11708534508943558, "num_tokens": 21530824.0, "reward": 0.9877589344978333, "reward_std": 0.16079741716384888, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7843038439750671, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8630890250205994, "step": 92 }, { "adv/mean_abs_final_conf": 0.6992151737213135, "adv/mean_abs_reasoning": 0.4734675884246826, "adv/mean_abs_step_conf": 0.7822864055633545, "adv/ratio_final_to_reasoning": 1.4767962809191149, "adv/ratio_step_to_reasoning": 1.6522491183951393, "adv/std_final_conf": 0.8908438086509705, "adv/std_reasoning": 0.7207232713699341, "adv/std_step_conf": 0.9336625337600708, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.710665362035225, "calib/avg_num_step_conf": 6.00390625, "calib/ece": 0.15475431606905712, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.43824701195219123, "calib/gap": 0.2935055446836268, "calib/mean_conf": 0.6632005312084992, "calib/mu_c": 0.7859817351598173, "calib/mu_w": 0.4924761904761905, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11814077025232407, "calib/std_conf": 0.3554428247470627, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4403506493506493, "calib/step_q_c_n": 770.0, "calib/step_q_gap": 0.08889041467007569, "calib/step_q_w": 0.35146023468057364, "calib/step_q_w_n": 767.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2697.0, "completions/max_terminated_length": 2697.0, "completions/mean_length": 521.6328125, "completions/mean_terminated_length": 523.678466796875, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.0992, "grad_norm": 0.046443577855825424, "kl": 0.0834197998046875, "learning_rate": 2.9722222222222225e-06, "loss": -0.0024, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.0352531261742115, "mask/share_reasoning": 0.8352970480918884, "mask/share_step_conf": 0.12554356455802917, "num_tokens": 21770138.0, "reward": 0.9602721333503723, "reward_std": 0.1766793429851532, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7515501976013184, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8588377237319946, "step": 93 }, { "adv/mean_abs_final_conf": 0.6296205520629883, "adv/mean_abs_reasoning": 0.4622064232826233, "adv/mean_abs_step_conf": 0.7736461758613586, "adv/ratio_final_to_reasoning": 1.3622064089706452, "adv/ratio_step_to_reasoning": 1.6738109573788866, "adv/std_final_conf": 0.859623908996582, "adv/std_reasoning": 0.7392425537109375, "adv/std_step_conf": 0.9339349269866943, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8167848699763592, "calib/avg_num_step_conf": 5.046875, "calib/ece": 0.1941666666666667, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.5277777777777778, "calib/gap": 0.3569963580601878, "calib/mean_conf": 0.7235317460317461, "calib/mu_c": 0.8807801418439716, "calib/mu_w": 0.5237837837837838, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.17908730158730163, "calib/std_conf": 0.33992854493092534, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5006395348837208, "calib/step_q_c_n": 688.0, "calib/step_q_gap": 0.1298448328969658, "calib/step_q_w": 0.370794701986755, "calib/step_q_w_n": 604.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2404.0, "completions/max_terminated_length": 2404.0, "completions/mean_length": 441.47265625, "completions/mean_terminated_length": 444.9488220214844, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.10026666666666667, "grad_norm": 0.03776202350854874, "kl": 0.0876617431640625, "learning_rate": 2.944444444444445e-06, "loss": -0.0469, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.037618488073349, "mask/share_reasoning": 0.8373420238494873, "mask/share_step_conf": 0.1172269657254219, "num_tokens": 21991835.0, "reward": 0.9584506750106812, "reward_std": 0.16993385553359985, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7676078081130981, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8438247442245483, "step": 94 }, { "adv/mean_abs_final_conf": 0.5176414847373962, "adv/mean_abs_reasoning": 0.449894517660141, "adv/mean_abs_step_conf": 0.766070544719696, "adv/ratio_final_to_reasoning": 1.1505841134264112, "adv/ratio_step_to_reasoning": 1.7027781283131807, "adv/std_final_conf": 0.7770000100135803, "adv/std_reasoning": 0.7205990552902222, "adv/std_step_conf": 0.9334955215454102, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7958348556814031, "calib/avg_num_step_conf": 4.93359375, "calib/ece": 0.17776422764227634, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7113821138211383, "calib/gap": 0.3479378881987578, "calib/mean_conf": 0.8097154471544715, "calib/mu_c": 0.9299378881987578, "calib/mu_w": 0.582, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.16650406504065032, "calib/std_conf": 0.32364715355659757, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.49762077294685986, "calib/step_q_c_n": 828.0, "calib/step_q_gap": 0.10221847409628515, "calib/step_q_w": 0.3954022988505747, "calib/step_q_w_n": 435.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2964.0, "completions/max_terminated_length": 2964.0, "completions/mean_length": 475.5625, "completions/mean_terminated_length": 483.11114501953125, "completions/min_length": 0.0, "completions/min_terminated_length": 119.0, "epoch": 0.10133333333333333, "grad_norm": 0.030413884669542313, "kl": 0.073638916015625, "learning_rate": 2.916666666666667e-06, "loss": -0.1127, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.036571212112903595, "mask/share_reasoning": 0.8310877680778503, "mask/share_step_conf": 0.11671602725982666, "num_tokens": 22219707.0, "reward": 0.9627537727355957, "reward_std": 0.1842033863067627, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7710347771644592, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8365039825439453, "step": 95 }, { "adv/mean_abs_final_conf": 0.45578479766845703, "adv/mean_abs_reasoning": 0.3871430456638336, "adv/mean_abs_step_conf": 0.7665640115737915, "adv/ratio_final_to_reasoning": 1.1773033321234623, "adv/ratio_step_to_reasoning": 1.9800536782453764, "adv/std_final_conf": 0.7365880012512207, "adv/std_reasoning": 0.6611899733543396, "adv/std_step_conf": 0.9334518909454346, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7475511695906434, "calib/avg_num_step_conf": 5.2109375, "calib/ece": 0.15976562499999997, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.76953125, "calib/gap": 0.34566081871345034, "calib/mean_conf": 0.8459375, "calib/mu_c": 0.9485555555555556, "calib/mu_w": 0.6028947368421053, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15128906249999996, "calib/std_conf": 0.3013004042376147, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5316342857142857, "calib/step_q_c_n": 875.0, "calib/step_q_gap": 0.12318308745720508, "calib/step_q_w": 0.40845119825708065, "calib/step_q_w_n": 459.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1169.0, "completions/max_terminated_length": 1169.0, "completions/mean_length": 414.76171875, "completions/mean_terminated_length": 416.3882751464844, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.1024, "grad_norm": 0.040975358337163925, "kl": 0.114013671875, "learning_rate": 2.888888888888889e-06, "loss": 0.0935, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.04015825688838959, "mask/share_reasoning": 0.8285998702049255, "mask/share_step_conf": 0.12733563780784607, "num_tokens": 22431702.0, "reward": 1.0118509531021118, "reward_std": 0.14020463824272156, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.8243891000747681, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8586878776550293, "step": 96 }, { "adv/mean_abs_final_conf": 0.6050560474395752, "adv/mean_abs_reasoning": 0.5235856771469116, "adv/mean_abs_step_conf": 0.7516707181930542, "adv/ratio_final_to_reasoning": 1.1556008383128555, "adv/ratio_step_to_reasoning": 1.4356212383215072, "adv/std_final_conf": 0.8272944688796997, "adv/std_reasoning": 0.7754360437393188, "adv/std_step_conf": 0.9345774054527283, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6281774109014676, "calib/avg_num_step_conf": 5.125, "calib/ece": 0.31728, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.768, "calib/gap": 0.18736242138364767, "calib/mean_conf": 0.8415999999999999, "calib/mu_c": 0.9210416666666666, "calib/mu_w": 0.733679245283019, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.29144, "calib/std_conf": 0.3124481396968143, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5813196480938417, "calib/step_q_c_n": 682.0, "calib/step_q_gap": 0.16146250523669892, "calib/step_q_w": 0.4198571428571428, "calib/step_q_w_n": 630.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3038.0, "completions/max_terminated_length": 3038.0, "completions/mean_length": 472.0859375, "completions/mean_terminated_length": 472.0859375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.10346666666666667, "grad_norm": 0.048959359526634216, "kl": 0.078338623046875, "learning_rate": 2.861111111111111e-06, "loss": -0.0059, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.037389181554317474, "mask/share_reasoning": 0.839634358882904, "mask/share_step_conf": 0.12297643721103668, "num_tokens": 22657628.0, "reward": 0.905328631401062, "reward_std": 0.22340336441993713, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6632086038589478, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8396360874176025, "step": 97 }, { "adv/mean_abs_final_conf": 0.5784022808074951, "adv/mean_abs_reasoning": 0.532549262046814, "adv/mean_abs_step_conf": 0.7428262233734131, "adv/ratio_final_to_reasoning": 1.0861009901402332, "adv/ratio_step_to_reasoning": 1.3948497844470107, "adv/std_final_conf": 0.7927812337875366, "adv/std_reasoning": 0.7927854061126709, "adv/std_step_conf": 0.9343677759170532, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6593717992488903, "calib/avg_num_step_conf": 4.671875, "calib/ece": 0.2732520325203251, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.7642276422764228, "calib/gap": 0.2434988050529191, "calib/mean_conf": 0.8292682926829268, "calib/mu_c": 0.9292413793103449, "calib/mu_w": 0.6857425742574258, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.2565447154471544, "calib/std_conf": 0.325585578104195, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5994912559618442, "calib/step_q_c_n": 629.0, "calib/step_q_gap": 0.11019495966554782, "calib/step_q_w": 0.48929629629629634, "calib/step_q_w_n": 567.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2677.0, "completions/max_terminated_length": 2677.0, "completions/mean_length": 503.44921875, "completions/mean_terminated_length": 503.44921875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.10453333333333334, "grad_norm": 0.026320848613977432, "kl": 0.07251739501953125, "learning_rate": 2.8333333333333335e-06, "loss": -0.0586, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.036959365010261536, "mask/share_reasoning": 0.8525199890136719, "mask/share_step_conf": 0.11052063852548599, "num_tokens": 22892695.0, "reward": 0.8927949666976929, "reward_std": 0.22106841206550598, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.676842987537384, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8048408031463623, "step": 98 }, { "adv/mean_abs_final_conf": 0.6866989135742188, "adv/mean_abs_reasoning": 0.6365125775337219, "adv/mean_abs_step_conf": 0.7182142734527588, "adv/ratio_final_to_reasoning": 1.0788457884602256, "adv/ratio_step_to_reasoning": 1.1283583369799293, "adv/std_final_conf": 0.8761346936225891, "adv/std_reasoning": 0.8589560389518738, "adv/std_step_conf": 0.9351794123649597, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.727749229188078, "calib/avg_num_step_conf": 5.62109375, "calib/ece": 0.298688524590164, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.569672131147541, "calib/gap": 0.2896731757451182, "calib/mean_conf": 0.6904098360655739, "calib/mu_c": 0.8554285714285713, "calib/mu_w": 0.5657553956834531, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2793852459016394, "calib/std_conf": 0.37926351741362724, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5856870229007634, "calib/step_q_c_n": 524.0, "calib/step_q_gap": 0.1374465857422934, "calib/step_q_w": 0.44824043715846995, "calib/step_q_w_n": 915.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2531.0, "completions/max_terminated_length": 2531.0, "completions/mean_length": 547.33203125, "completions/mean_terminated_length": 560.468017578125, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.1056, "grad_norm": 0.03607296198606491, "kl": 0.0640716552734375, "learning_rate": 2.805555555555556e-06, "loss": -0.0437, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03173251822590828, "mask/share_reasoning": 0.8410789966583252, "mask/share_step_conf": 0.10375095903873444, "num_tokens": 23138612.0, "reward": 0.8512870073318481, "reward_std": 0.25130918622016907, "rewards/accuracy_reward_step": 0.41015625, "rewards/final_brier_reward_step": 0.6493609547615051, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.7813380360603333, "step": 99 }, { "adv/mean_abs_final_conf": 0.5975271463394165, "adv/mean_abs_reasoning": 0.4342886209487915, "adv/mean_abs_step_conf": 0.7729626893997192, "adv/ratio_final_to_reasoning": 1.3758756677391117, "adv/ratio_step_to_reasoning": 1.7798363855608872, "adv/std_final_conf": 0.838344395160675, "adv/std_reasoning": 0.6816809773445129, "adv/std_step_conf": 0.9339935183525085, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7793527508090615, "calib/avg_num_step_conf": 5.21484375, "calib/ece": 0.18007075098814238, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6442687747035574, "calib/gap": 0.41364636245954683, "calib/mean_conf": 0.7285458498023716, "calib/mu_c": 0.8969473333333333, "calib/mu_w": 0.4833009708737865, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1578656126482214, "calib/std_conf": 0.3833230272162609, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5946691176470589, "calib/step_q_c_n": 816.0, "calib/step_q_gap": 0.1484070752578489, "calib/step_q_w": 0.44626204238920997, "calib/step_q_w_n": 519.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1848.0, "completions/max_terminated_length": 1848.0, "completions/mean_length": 501.37890625, "completions/mean_terminated_length": 507.3241271972656, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.10666666666666667, "grad_norm": 0.033330369740724564, "kl": 0.06787872314453125, "learning_rate": 2.7777777777777783e-06, "loss": -0.0849, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03490392118692398, "mask/share_reasoning": 0.8391332626342773, "mask/share_step_conf": 0.11424408107995987, "num_tokens": 23374373.0, "reward": 0.9700980186462402, "reward_std": 0.1729770302772522, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7836803197860718, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8416720628738403, "step": 100 }, { "adv/mean_abs_final_conf": 0.656168520450592, "adv/mean_abs_reasoning": 0.4810579717159271, "adv/mean_abs_step_conf": 0.7474485635757446, "adv/ratio_final_to_reasoning": 1.3640113230221464, "adv/ratio_step_to_reasoning": 1.5537598533282921, "adv/std_final_conf": 0.8590138554573059, "adv/std_reasoning": 0.7576479911804199, "adv/std_step_conf": 0.9346694946289062, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7411160831797843, "calib/avg_num_step_conf": 5.92578125, "calib/ece": 0.24054412955465587, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.48582995951417, "calib/gap": 0.3265280600157936, "calib/mean_conf": 0.6508728744939272, "calib/mu_c": 0.824051724137931, "calib/mu_w": 0.49752366412213744, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.21089068825910934, "calib/std_conf": 0.3882587265444154, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5500429184549357, "calib/step_q_c_n": 699.0, "calib/step_q_gap": 0.09687665928623151, "calib/step_q_w": 0.4531662591687042, "calib/step_q_w_n": 818.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2952.0, "completions/max_terminated_length": 2952.0, "completions/mean_length": 559.57421875, "completions/mean_terminated_length": 563.9802856445312, "completions/min_length": 0.0, "completions/min_terminated_length": 146.0, "epoch": 0.10773333333333333, "grad_norm": 0.043247781693935394, "kl": 0.07190704345703125, "learning_rate": 2.7500000000000004e-06, "loss": 0.0092, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.033356837928295135, "mask/share_reasoning": 0.8435466885566711, "mask/share_step_conf": 0.11528396606445312, "num_tokens": 23624616.0, "reward": 0.8842967748641968, "reward_std": 0.21739190816879272, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.7008475065231323, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7849335074424744, "step": 101 }, { "adv/mean_abs_final_conf": 0.5095280408859253, "adv/mean_abs_reasoning": 0.31084319949150085, "adv/mean_abs_step_conf": 0.7629257440567017, "adv/ratio_final_to_reasoning": 1.6391802739112422, "adv/ratio_step_to_reasoning": 2.454374891600489, "adv/std_final_conf": 0.7696377635002136, "adv/std_reasoning": 0.6184869408607483, "adv/std_step_conf": 0.9334552884101868, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.783984165324745, "calib/avg_num_step_conf": 5.06640625, "calib/ece": 0.15271732283464579, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6377952755905512, "calib/gap": 0.42152310789049907, "calib/mean_conf": 0.7429519685039369, "calib/mu_c": 0.8956296296296296, "calib/mu_w": 0.4741065217391305, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.12893700787401588, "calib/std_conf": 0.36587584193100725, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5906177606177606, "calib/step_q_c_n": 777.0, "calib/step_q_gap": 0.16017218369468372, "calib/step_q_w": 0.4304455769230769, "calib/step_q_w_n": 520.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1458.0, "completions/max_terminated_length": 1458.0, "completions/mean_length": 408.7421875, "completions/mean_terminated_length": 410.3451232910156, "completions/min_length": 0.0, "completions/min_terminated_length": 93.0, "epoch": 0.1088, "grad_norm": 0.03269050642848015, "kl": 0.0802764892578125, "learning_rate": 2.7222222222222224e-06, "loss": -0.0296, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.04328569769859314, "mask/share_reasoning": 0.8195648193359375, "mask/share_step_conf": 0.13324323296546936, "num_tokens": 23835950.0, "reward": 0.9991936087608337, "reward_std": 0.1344832181930542, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.8124216198921204, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8609656095504761, "step": 102 }, { "adv/mean_abs_final_conf": 0.5931665897369385, "adv/mean_abs_reasoning": 0.4472111463546753, "adv/mean_abs_step_conf": 0.7450423240661621, "adv/ratio_final_to_reasoning": 1.3263680804290787, "adv/ratio_step_to_reasoning": 1.665974406360798, "adv/std_final_conf": 0.8274978995323181, "adv/std_reasoning": 0.7392292022705078, "adv/std_step_conf": 0.9343953132629395, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7972413793103449, "calib/avg_num_step_conf": 6.109375, "calib/ece": 0.1719200000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.64, "calib/gap": 0.43064039408866994, "calib/mean_conf": 0.7312000000000001, "calib/mu_c": 0.9120689655172414, "calib/mu_w": 0.48142857142857143, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16156000000000012, "calib/std_conf": 0.3776561398944813, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6325307391091531, "calib/step_q_c_n": 681.0, "calib/step_q_gap": 0.19963153186113503, "calib/step_q_w": 0.4328992072480181, "calib/step_q_w_n": 883.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2704.0, "completions/max_terminated_length": 2704.0, "completions/mean_length": 590.5390625, "completions/mean_terminated_length": 590.5390625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.10986666666666667, "grad_norm": 0.02490939199924469, "kl": 0.06413650512695312, "learning_rate": 2.6944444444444444e-06, "loss": 0.1038, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03427442908287048, "mask/share_reasoning": 0.8512352108955383, "mask/share_step_conf": 0.11449037492275238, "num_tokens": 24091680.0, "reward": 0.9671297073364258, "reward_std": 0.19709224998950958, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7819554805755615, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8437101244926453, "step": 103 }, { "adv/mean_abs_final_conf": 0.6492934823036194, "adv/mean_abs_reasoning": 0.47703713178634644, "adv/mean_abs_step_conf": 0.7521624565124512, "adv/ratio_final_to_reasoning": 1.3610963152327993, "adv/ratio_step_to_reasoning": 1.5767377556037436, "adv/std_final_conf": 0.8646968603134155, "adv/std_reasoning": 0.7393373847007751, "adv/std_step_conf": 0.9346559643745422, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.794665404040404, "calib/avg_num_step_conf": 5.3984375, "calib/ece": 0.16884920634920625, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.42857142857142855, "calib/gap": 0.45039393939393946, "calib/mean_conf": 0.5525793650793651, "calib/mu_c": 0.7885000000000001, "calib/mu_w": 0.33810606060606063, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12261904761904756, "calib/std_conf": 0.4203844773306327, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6118739352640544, "calib/step_q_c_n": 587.0, "calib/step_q_gap": 0.17602487866028083, "calib/step_q_w": 0.4358490566037736, "calib/step_q_w_n": 795.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2705.0, "completions/max_terminated_length": 2705.0, "completions/mean_length": 517.01171875, "completions/mean_terminated_length": 517.01171875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.11093333333333333, "grad_norm": 0.04363548383116722, "kl": 0.0774993896484375, "learning_rate": 2.666666666666667e-06, "loss": 0.04, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03508147969841957, "mask/share_reasoning": 0.8457739949226379, "mask/share_step_conf": 0.119144506752491, "num_tokens": 24330715.0, "reward": 0.941875696182251, "reward_std": 0.17754782736301422, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.7803089618682861, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8128174543380737, "step": 104 }, { "adv/mean_abs_final_conf": 0.6836636662483215, "adv/mean_abs_reasoning": 0.5900191068649292, "adv/mean_abs_step_conf": 0.7682121992111206, "adv/ratio_final_to_reasoning": 1.1587144522843902, "adv/ratio_step_to_reasoning": 1.3020124098913028, "adv/std_final_conf": 0.889095664024353, "adv/std_reasoning": 0.8100023865699768, "adv/std_step_conf": 0.9345039129257202, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8001728953318261, "calib/avg_num_step_conf": 5.3828125, "calib/ece": 0.15961686746987958, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5140562248995983, "calib/gap": 0.4813849049075675, "calib/mean_conf": 0.6007044176706827, "calib/mu_c": 0.7998315068493151, "calib/mu_w": 0.3184466019417476, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.08698795180722894, "calib/std_conf": 0.42927057589576506, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5579889807162535, "calib/step_q_c_n": 726.0, "calib/step_q_gap": 0.13505953286349276, "calib/step_q_w": 0.4229294478527607, "calib/step_q_w_n": 652.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1921.0, "completions/max_terminated_length": 1921.0, "completions/mean_length": 476.15234375, "completions/mean_terminated_length": 481.7984313964844, "completions/min_length": 0.0, "completions/min_terminated_length": 22.0, "epoch": 0.112, "grad_norm": 0.0338783822953701, "kl": 0.072052001953125, "learning_rate": 2.6388888888888893e-06, "loss": -0.1031, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03626061603426933, "mask/share_reasoning": 0.8372711539268494, "mask/share_step_conf": 0.11474946141242981, "num_tokens": 24558370.0, "reward": 0.9632641673088074, "reward_std": 0.21223922073841095, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7844381332397461, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8334963917732239, "step": 105 }, { "adv/mean_abs_final_conf": 0.6210803389549255, "adv/mean_abs_reasoning": 0.4079238772392273, "adv/mean_abs_step_conf": 0.7573671340942383, "adv/ratio_final_to_reasoning": 1.5225398011960267, "adv/ratio_step_to_reasoning": 1.856638398369801, "adv/std_final_conf": 0.8601810932159424, "adv/std_reasoning": 0.7012488842010498, "adv/std_step_conf": 0.9338719248771667, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7499840835296364, "calib/avg_num_step_conf": 5.27734375, "calib/ece": 0.20079365079365077, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.48412698412698413, "calib/gap": 0.3779970713694532, "calib/mean_conf": 0.5992063492063492, "calib/mu_c": 0.7687050359712231, "calib/mu_w": 0.3907079646017699, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1242063492063492, "calib/std_conf": 0.41771791798689534, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5683842794759826, "calib/step_q_c_n": 687.0, "calib/step_q_gap": 0.14180295417477778, "calib/step_q_w": 0.4265813253012048, "calib/step_q_w_n": 664.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2447.0, "completions/max_terminated_length": 2447.0, "completions/mean_length": 475.50390625, "completions/mean_terminated_length": 477.36865234375, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.11306666666666666, "grad_norm": 0.037723153829574585, "kl": 0.123046875, "learning_rate": 2.6111111111111113e-06, "loss": -0.0421, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03565295785665512, "mask/share_reasoning": 0.8457275629043579, "mask/share_step_conf": 0.11471326649188995, "num_tokens": 24784683.0, "reward": 0.9509889483451843, "reward_std": 0.17380815744400024, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.7509718537330627, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8455371856689453, "step": 106 }, { "adv/mean_abs_final_conf": 0.6779407262802124, "adv/mean_abs_reasoning": 0.493292897939682, "adv/mean_abs_step_conf": 0.7233132123947144, "adv/ratio_final_to_reasoning": 1.3743168188955124, "adv/ratio_step_to_reasoning": 1.4662956134494325, "adv/std_final_conf": 0.8676726818084717, "adv/std_reasoning": 0.7575341463088989, "adv/std_step_conf": 0.9344541430473328, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6941146102219928, "calib/avg_num_step_conf": 5.72265625, "calib/ece": 0.24161067193675892, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.549407114624506, "calib/gap": 0.29727607124419203, "calib/mean_conf": 0.6545948616600791, "calib/mu_c": 0.7767953020134228, "calib/mu_w": 0.4795192307692308, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15363636363636368, "calib/std_conf": 0.3998723349794255, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5191130012150669, "calib/step_q_c_n": 823.0, "calib/step_q_gap": 0.11733730028048733, "calib/step_q_w": 0.40177570093457954, "calib/step_q_w_n": 642.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2696.0, "completions/max_terminated_length": 2696.0, "completions/mean_length": 468.5078125, "completions/mean_terminated_length": 468.5078125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.11413333333333334, "grad_norm": 0.0470091886818409, "kl": 0.07814788818359375, "learning_rate": 2.5833333333333337e-06, "loss": 0.0497, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03784269094467163, "mask/share_reasoning": 0.8325836658477783, "mask/share_step_conf": 0.12957364320755005, "num_tokens": 25009237.0, "reward": 0.9420108795166016, "reward_std": 0.18782854080200195, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7289911508560181, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8409680128097534, "step": 107 }, { "adv/mean_abs_final_conf": 0.5114879608154297, "adv/mean_abs_reasoning": 0.3971118927001953, "adv/mean_abs_step_conf": 0.7521694898605347, "adv/ratio_final_to_reasoning": 1.2880197501452924, "adv/ratio_step_to_reasoning": 1.8940996320863013, "adv/std_final_conf": 0.7579714059829712, "adv/std_reasoning": 0.6816282868385315, "adv/std_step_conf": 0.9336187243461609, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6744449846054124, "calib/avg_num_step_conf": 5.8125, "calib/ece": 0.22449275362318838, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.691699604743083, "calib/gap": 0.30871657754010695, "calib/mean_conf": 0.7433333333333333, "calib/mu_c": 0.8238680926916221, "calib/mu_w": 0.5151515151515151, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.11434782608695654, "calib/std_conf": 0.3995920185528892, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5359702970297029, "calib/step_q_c_n": 1010.0, "calib/step_q_gap": 0.14574017150669027, "calib/step_q_w": 0.3902301255230126, "calib/step_q_w_n": 478.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2290.0, "completions/max_terminated_length": 2290.0, "completions/mean_length": 521.9921875, "completions/mean_terminated_length": 521.9921875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.1152, "grad_norm": 0.031734488904476166, "kl": 0.0790863037109375, "learning_rate": 2.5555555555555557e-06, "loss": -0.0242, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03476386517286301, "mask/share_reasoning": 0.8393080234527588, "mask/share_step_conf": 0.12592805922031403, "num_tokens": 25246099.0, "reward": 0.9783648252487183, "reward_std": 0.16985079646110535, "rewards/accuracy_reward_step": 0.73046875, "rewards/final_brier_reward_step": 0.7575603723526001, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8554192185401917, "step": 108 }, { "adv/mean_abs_final_conf": 0.6080666780471802, "adv/mean_abs_reasoning": 0.44817543029785156, "adv/mean_abs_step_conf": 0.7608993053436279, "adv/ratio_final_to_reasoning": 1.356760404386887, "adv/ratio_step_to_reasoning": 1.6977711268954305, "adv/std_final_conf": 0.8206971883773804, "adv/std_reasoning": 0.7392958402633667, "adv/std_step_conf": 0.9336312413215637, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.8191, "calib/avg_num_step_conf": 6.29296875, "calib/ece": 0.16285714285714276, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.4489795918367347, "calib/gap": 0.5080299999999999, "calib/mean_conf": 0.5344489795918367, "calib/mu_c": 0.7832799999999999, "calib/mu_w": 0.27525, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.09355102040816317, "calib/std_conf": 0.4471490214647886, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5017307152875176, "calib/step_q_c_n": 713.0, "calib/step_q_gap": 0.199670581657228, "calib/step_q_w": 0.30206013363028955, "calib/step_q_w_n": 898.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3022.0, "completions/max_terminated_length": 3022.0, "completions/mean_length": 556.76953125, "completions/mean_terminated_length": 563.37158203125, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.11626666666666667, "grad_norm": 0.060035791248083115, "kl": 0.07425689697265625, "learning_rate": 2.5277777777777778e-06, "loss": -0.0473, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.031296491622924805, "mask/share_reasoning": 0.8337397575378418, "mask/share_step_conf": 0.12324501574039459, "num_tokens": 25493232.0, "reward": 0.9433398842811584, "reward_std": 0.19780653715133667, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.7644991874694824, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8346804976463318, "step": 109 }, { "adv/mean_abs_final_conf": 0.6949703693389893, "adv/mean_abs_reasoning": 0.5166112184524536, "adv/mean_abs_step_conf": 0.7524176836013794, "adv/ratio_final_to_reasoning": 1.345248311526845, "adv/ratio_step_to_reasoning": 1.4564485956292261, "adv/std_final_conf": 0.8908286094665527, "adv/std_reasoning": 0.7576285004615784, "adv/std_step_conf": 0.9336581230163574, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6592592592592593, "calib/avg_num_step_conf": 5.07421875, "calib/ece": 0.3303187250996016, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.46215139442231074, "calib/gap": 0.22483588761174977, "calib/mean_conf": 0.5544621513944223, "calib/mu_c": 0.6583703703703704, "calib/mu_w": 0.4335344827586206, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1734661354581673, "calib/std_conf": 0.4403701919272022, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.49868113522537566, "calib/step_q_c_n": 599.0, "calib/step_q_gap": 0.1533811352253756, "calib/step_q_w": 0.34530000000000005, "calib/step_q_w_n": 700.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2472.0, "completions/max_terminated_length": 2472.0, "completions/mean_length": 490.71875, "completions/mean_terminated_length": 490.71875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.11733333333333333, "grad_norm": 0.031973760575056076, "kl": 0.079925537109375, "learning_rate": 2.5e-06, "loss": 0.045, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.036116331815719604, "mask/share_reasoning": 0.8464046716690063, "mask/share_step_conf": 0.11747899651527405, "num_tokens": 25723776.0, "reward": 0.89471036195755, "reward_std": 0.18601158261299133, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.655937910079956, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.831920325756073, "step": 110 }, { "adv/mean_abs_final_conf": 0.5667400360107422, "adv/mean_abs_reasoning": 0.47426918148994446, "adv/mean_abs_step_conf": 0.7553653717041016, "adv/ratio_final_to_reasoning": 1.194975465684477, "adv/ratio_step_to_reasoning": 1.5926933504957606, "adv/std_final_conf": 0.7973593473434448, "adv/std_reasoning": 0.7393887639045715, "adv/std_step_conf": 0.9348477721214294, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7744879201680672, "calib/avg_num_step_conf": 5.15234375, "calib/ece": 0.21139784946236562, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5080645161290323, "calib/gap": 0.45991946778711484, "calib/mean_conf": 0.5748924731182795, "calib/mu_c": 0.7825980392156863, "calib/mu_w": 0.3226785714285714, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.11895161290322583, "calib/std_conf": 0.45134197262340864, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5010180623973728, "calib/step_q_c_n": 609.0, "calib/step_q_gap": 0.18886313281990802, "calib/step_q_w": 0.31215492957746477, "calib/step_q_w_n": 710.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2971.0, "completions/max_terminated_length": 2971.0, "completions/mean_length": 515.125, "completions/mean_terminated_length": 519.1810913085938, "completions/min_length": 0.0, "completions/min_terminated_length": 106.0, "epoch": 0.1184, "grad_norm": 0.030626846477389336, "kl": 0.07135009765625, "learning_rate": 2.4722222222222226e-06, "loss": 0.041, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.037555031478405, "mask/share_reasoning": 0.8396614789962769, "mask/share_step_conf": 0.11497093737125397, "num_tokens": 25963056.0, "reward": 0.9370782375335693, "reward_std": 0.18119025230407715, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7514935731887817, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8226628303527832, "step": 111 }, { "adv/mean_abs_final_conf": 0.6594914793968201, "adv/mean_abs_reasoning": 0.5638391375541687, "adv/mean_abs_step_conf": 0.7696518898010254, "adv/ratio_final_to_reasoning": 1.1696447363650098, "adv/ratio_step_to_reasoning": 1.3650203374310532, "adv/std_final_conf": 0.8604232668876648, "adv/std_reasoning": 0.8266275525093079, "adv/std_step_conf": 0.9347512722015381, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.7883694733877782, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.23908333333333331, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.3333333333333333, "calib/gap": 0.4711236271472825, "calib/mean_conf": 0.3926666666666667, "calib/mu_c": 0.6007462686567164, "calib/mu_w": 0.12962264150943395, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.03670833333333334, "calib/std_conf": 0.44674700769998327, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.43881305637982193, "calib/step_q_c_n": 674.0, "calib/step_q_gap": 0.18350478570313017, "calib/step_q_w": 0.25530827067669176, "calib/step_q_w_n": 665.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2900.0, "completions/max_terminated_length": 2900.0, "completions/mean_length": 571.58203125, "completions/mean_terminated_length": 585.300048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 218.0, "epoch": 0.11946666666666667, "grad_norm": 0.03158825263381004, "kl": 0.06920623779296875, "learning_rate": 2.4444444444444447e-06, "loss": -0.0995, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.028736168518662453, "mask/share_reasoning": 0.8447258472442627, "mask/share_step_conf": 0.1031005010008812, "num_tokens": 26217301.0, "reward": 0.8999679088592529, "reward_std": 0.23615305125713348, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7113093733787537, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.7964389324188232, "step": 112 }, { "adv/mean_abs_final_conf": 0.6749004125595093, "adv/mean_abs_reasoning": 0.47871798276901245, "adv/mean_abs_step_conf": 0.7707317471504211, "adv/ratio_final_to_reasoning": 1.409807938811351, "adv/ratio_step_to_reasoning": 1.6099912158977931, "adv/std_final_conf": 0.8600090742111206, "adv/std_reasoning": 0.7206716537475586, "adv/std_step_conf": 0.9334895014762878, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7614010989010989, "calib/avg_num_step_conf": 6.1484375, "calib/ece": 0.23681274900398408, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.41832669322709165, "calib/gap": 0.4297637362637362, "calib/mean_conf": 0.5176892430278884, "calib/mu_c": 0.6735, "calib/mu_w": 0.24373626373626378, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05852589641434269, "calib/std_conf": 0.44088764033280836, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42194539249146756, "calib/step_q_c_n": 879.0, "calib/step_q_gap": 0.18594539249146755, "calib/step_q_w": 0.23600000000000002, "calib/step_q_w_n": 695.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2987.0, "completions/max_terminated_length": 2987.0, "completions/mean_length": 482.6875, "completions/mean_terminated_length": 486.4881896972656, "completions/min_length": 0.0, "completions/min_terminated_length": 118.0, "epoch": 0.12053333333333334, "grad_norm": 0.04572642967104912, "kl": 0.08179473876953125, "learning_rate": 2.4166666666666667e-06, "loss": -0.0577, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.035985179245471954, "mask/share_reasoning": 0.8264555931091309, "mask/share_step_conf": 0.129746675491333, "num_tokens": 26446069.0, "reward": 0.9624725580215454, "reward_std": 0.17017316818237305, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7439906597137451, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8598606586456299, "step": 113 }, { "adv/mean_abs_final_conf": 0.6364820599555969, "adv/mean_abs_reasoning": 0.38433846831321716, "adv/mean_abs_step_conf": 0.7515975832939148, "adv/ratio_final_to_reasoning": 1.6560456796036742, "adv/ratio_step_to_reasoning": 1.9555616865325052, "adv/std_final_conf": 0.8455886840820312, "adv/std_reasoning": 0.6612743735313416, "adv/std_step_conf": 0.9331404566764832, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.8306848404255318, "calib/avg_num_step_conf": 5.7890625, "calib/ece": 0.1618503937007874, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.4881889763779528, "calib/gap": 0.528344414893617, "calib/mean_conf": 0.5772834645669291, "calib/mu_c": 0.7728125, "calib/mu_w": 0.244468085106383, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.054606299212598436, "calib/std_conf": 0.43322824806726284, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.46794717887154863, "calib/step_q_c_n": 833.0, "calib/step_q_gap": 0.1828470247883437, "calib/step_q_w": 0.28510015408320494, "calib/step_q_w_n": 649.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2288.0, "completions/max_terminated_length": 2288.0, "completions/mean_length": 510.55078125, "completions/mean_terminated_length": 512.552978515625, "completions/min_length": 0.0, "completions/min_terminated_length": 121.0, "epoch": 0.1216, "grad_norm": 0.04408552497625351, "kl": 0.0717620849609375, "learning_rate": 2.388888888888889e-06, "loss": 0.0212, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03622880205512047, "mask/share_reasoning": 0.8335362672805786, "mask/share_step_conf": 0.12632864713668823, "num_tokens": 26681794.0, "reward": 1.0109832286834717, "reward_std": 0.14032113552093506, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.816330075263977, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.882198691368103, "step": 114 }, { "adv/mean_abs_final_conf": 0.7068969011306763, "adv/mean_abs_reasoning": 0.4891188144683838, "adv/mean_abs_step_conf": 0.7457759380340576, "adv/ratio_final_to_reasoning": 1.445245777141066, "adv/ratio_step_to_reasoning": 1.5247336965449403, "adv/std_final_conf": 0.8840060830116272, "adv/std_reasoning": 0.7392686009407043, "adv/std_step_conf": 0.934516966342926, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6875624687656172, "calib/avg_num_step_conf": 5.09375, "calib/ece": 0.29598425196850386, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.43700787401574803, "calib/gap": 0.2785732133933033, "calib/mean_conf": 0.5239370078740158, "calib/mu_c": 0.6511594202898551, "calib/mu_w": 0.37258620689655175, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13830708661417318, "calib/std_conf": 0.44368208359993827, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42582047685834507, "calib/step_q_c_n": 713.0, "calib/step_q_gap": 0.08035516382958025, "calib/step_q_w": 0.3454653130287648, "calib/step_q_w_n": 591.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1066.0, "completions/max_terminated_length": 1066.0, "completions/mean_length": 441.0546875, "completions/mean_terminated_length": 442.7843322753906, "completions/min_length": 0.0, "completions/min_terminated_length": 125.0, "epoch": 0.12266666666666666, "grad_norm": 0.05775079503655434, "kl": 0.08318328857421875, "learning_rate": 2.361111111111111e-06, "loss": 0.008, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03622948005795479, "mask/share_reasoning": 0.8380911350250244, "mask/share_step_conf": 0.12177319079637527, "num_tokens": 26899968.0, "reward": 0.9117941856384277, "reward_std": 0.18091896176338196, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6874749660491943, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.828300952911377, "step": 115 }, { "adv/mean_abs_final_conf": 0.6373252868652344, "adv/mean_abs_reasoning": 0.3526563048362732, "adv/mean_abs_step_conf": 0.7363879680633545, "adv/ratio_final_to_reasoning": 1.807213647182981, "adv/ratio_step_to_reasoning": 2.0881179719875855, "adv/std_final_conf": 0.8416139483451843, "adv/std_reasoning": 0.6403775811195374, "adv/std_step_conf": 0.9332254528999329, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7618092105263158, "calib/avg_num_step_conf": 5.59765625, "calib/ece": 0.22785714285714284, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.46825396825396826, "calib/gap": 0.3854131578947369, "calib/mean_conf": 0.5585714285714285, "calib/mu_c": 0.7115131578947369, "calib/mu_w": 0.3261, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.09162698412698413, "calib/std_conf": 0.44005449569760496, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4407152317880795, "calib/step_q_c_n": 755.0, "calib/step_q_gap": 0.1707742288382269, "calib/step_q_w": 0.26994100294985257, "calib/step_q_w_n": 678.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2713.0, "completions/max_terminated_length": 2713.0, "completions/mean_length": 513.18359375, "completions/mean_terminated_length": 517.2244262695312, "completions/min_length": 0.0, "completions/min_terminated_length": 86.0, "epoch": 0.12373333333333333, "grad_norm": 0.049823228269815445, "kl": 0.07218170166015625, "learning_rate": 2.3333333333333336e-06, "loss": -0.0028, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03470658138394356, "mask/share_reasoning": 0.8393930792808533, "mask/share_step_conf": 0.11808786541223526, "num_tokens": 27135863.0, "reward": 0.9575741291046143, "reward_std": 0.1541958898305893, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7338913679122925, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8664129972457886, "step": 116 }, { "adv/mean_abs_final_conf": 0.7075223326683044, "adv/mean_abs_reasoning": 0.48216667771339417, "adv/mean_abs_step_conf": 0.7466195821762085, "adv/ratio_final_to_reasoning": 1.467381230124875, "adv/ratio_step_to_reasoning": 1.548467815563166, "adv/std_final_conf": 0.8886599540710449, "adv/std_reasoning": 0.7393694519996643, "adv/std_step_conf": 0.9349843263626099, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6939594749585829, "calib/avg_num_step_conf": 5.55078125, "calib/ece": 0.2669322709163347, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.46613545816733065, "calib/gap": 0.281900089206066, "calib/mean_conf": 0.5914741035856573, "calib/mu_c": 0.7408474576271186, "calib/mu_w": 0.45894736842105255, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1941434262948208, "calib/std_conf": 0.41703351247183346, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.44488524590163936, "calib/step_q_c_n": 610.0, "calib/step_q_gap": 0.10553876008166396, "calib/step_q_w": 0.3393464858199754, "calib/step_q_w_n": 811.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2839.0, "completions/max_terminated_length": 2839.0, "completions/mean_length": 511.55078125, "completions/mean_terminated_length": 513.556884765625, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.1248, "grad_norm": 0.043811291456222534, "kl": 0.07315826416015625, "learning_rate": 2.305555555555556e-06, "loss": -0.0016, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03278231620788574, "mask/share_reasoning": 0.8445600271224976, "mask/share_step_conf": 0.1187513917684555, "num_tokens": 27373420.0, "reward": 0.9010759592056274, "reward_std": 0.21563945710659027, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.681955873966217, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8334771990776062, "step": 117 }, { "adv/mean_abs_final_conf": 0.6128525733947754, "adv/mean_abs_reasoning": 0.3557014465332031, "adv/mean_abs_step_conf": 0.7667481899261475, "adv/ratio_final_to_reasoning": 1.7229409083596976, "adv/ratio_step_to_reasoning": 2.155594804010939, "adv/std_final_conf": 0.8230372071266174, "adv/std_reasoning": 0.640249490737915, "adv/std_step_conf": 0.9342384934425354, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.755758695795657, "calib/avg_num_step_conf": 6.54296875, "calib/ece": 0.22661290322580652, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5887096774193549, "calib/gap": 0.33595274239324147, "calib/mean_conf": 0.7010483870967742, "calib/mu_c": 0.848705035971223, "calib/mu_w": 0.5127522935779816, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.18358870967741941, "calib/std_conf": 0.3858815181448648, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.42637698898408816, "calib/step_q_c_n": 817.0, "calib/step_q_gap": 0.10349820110530022, "calib/step_q_w": 0.32287878787878793, "calib/step_q_w_n": 858.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2961.0, "completions/max_terminated_length": 2961.0, "completions/mean_length": 562.41015625, "completions/mean_terminated_length": 566.8385620117188, "completions/min_length": 0.0, "completions/min_terminated_length": 119.0, "epoch": 0.12586666666666665, "grad_norm": 0.038998398929834366, "kl": 0.0649261474609375, "learning_rate": 2.277777777777778e-06, "loss": -0.0288, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03201957419514656, "mask/share_reasoning": 0.8347534537315369, "mask/share_step_conf": 0.12541446089744568, "num_tokens": 27621405.0, "reward": 0.9234127402305603, "reward_std": 0.18003970384597778, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.7270601391792297, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.817421555519104, "step": 118 }, { "adv/mean_abs_final_conf": 0.69493168592453, "adv/mean_abs_reasoning": 0.5836117267608643, "adv/mean_abs_step_conf": 0.7426487803459167, "adv/ratio_final_to_reasoning": 1.190743184311098, "adv/ratio_step_to_reasoning": 1.2725048971646489, "adv/std_final_conf": 0.8770858645439148, "adv/std_reasoning": 0.8099712133407593, "adv/std_step_conf": 0.9345157146453857, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7353337374553781, "calib/avg_num_step_conf": 5.81640625, "calib/ece": 0.24382661290322566, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5564516129032258, "calib/gap": 0.3643693001953257, "calib/mean_conf": 0.6318185483870967, "calib/mu_c": 0.7802108843537415, "calib/mu_w": 0.4158415841584158, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.14145161290322567, "calib/std_conf": 0.4348907947584482, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4687263556116015, "calib/step_q_c_n": 793.0, "calib/step_q_gap": 0.161686125726544, "calib/step_q_w": 0.30704022988505747, "calib/step_q_w_n": 696.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2572.0, "completions/max_terminated_length": 2572.0, "completions/mean_length": 559.5546875, "completions/mean_terminated_length": 566.1897583007812, "completions/min_length": 0.0, "completions/min_terminated_length": 149.0, "epoch": 0.12693333333333334, "grad_norm": 0.03349678963422775, "kl": 0.06784820556640625, "learning_rate": 2.25e-06, "loss": -0.0964, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03033093363046646, "mask/share_reasoning": 0.8460862636566162, "mask/share_step_conf": 0.11186406016349792, "num_tokens": 27869715.0, "reward": 0.9392236471176147, "reward_std": 0.23281517624855042, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7167088985443115, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8539257645606995, "step": 119 }, { "adv/mean_abs_final_conf": 0.616726279258728, "adv/mean_abs_reasoning": 0.47547075152397156, "adv/mean_abs_step_conf": 0.7524898052215576, "adv/ratio_final_to_reasoning": 1.2970856299404463, "adv/ratio_step_to_reasoning": 1.5826205982380386, "adv/std_final_conf": 0.8283965587615967, "adv/std_reasoning": 0.7575518488883972, "adv/std_step_conf": 0.9344828128814697, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.8128576474743907, "calib/avg_num_step_conf": 5.22265625, "calib/ece": 0.19245901639344262, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.569672131147541, "calib/gap": 0.48753373366301667, "calib/mean_conf": 0.6265573770491804, "calib/mu_c": 0.8163758389261745, "calib/mu_w": 0.3288421052631578, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.10418032786885248, "calib/std_conf": 0.44613416068580203, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.43939470365699873, "calib/step_q_c_n": 793.0, "calib/step_q_gap": 0.13764838012758696, "calib/step_q_w": 0.3017463235294118, "calib/step_q_w_n": 544.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2755.0, "completions/max_terminated_length": 2755.0, "completions/mean_length": 499.078125, "completions/mean_terminated_length": 503.00787353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 159.0, "epoch": 0.128, "grad_norm": 0.04359474778175354, "kl": 0.078948974609375, "learning_rate": 2.222222222222222e-06, "loss": 0.0079, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03324298560619354, "mask/share_reasoning": 0.845354437828064, "mask/share_step_conf": 0.11359011381864548, "num_tokens": 28104167.0, "reward": 0.9440826177597046, "reward_std": 0.20472922921180725, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7542468309402466, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.827668309211731, "step": 120 }, { "adv/mean_abs_final_conf": 0.7143666744232178, "adv/mean_abs_reasoning": 0.517063319683075, "adv/mean_abs_step_conf": 0.7503951787948608, "adv/ratio_final_to_reasoning": 1.381584512436652, "adv/ratio_step_to_reasoning": 1.4512636078204166, "adv/std_final_conf": 0.8769662380218506, "adv/std_reasoning": 0.7576488256454468, "adv/std_step_conf": 0.9345428943634033, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7345043280470083, "calib/avg_num_step_conf": 6.125, "calib/ece": 0.23984126984126986, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5555555555555556, "calib/gap": 0.3350906678460858, "calib/mean_conf": 0.6763492063492064, "calib/mu_c": 0.8345864661654135, "calib/mu_w": 0.4994957983193277, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19420634920634922, "calib/std_conf": 0.4057241286035576, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4496797671033479, "calib/step_q_c_n": 687.0, "calib/step_q_gap": 0.12050837096259875, "calib/step_q_w": 0.32917139614074914, "calib/step_q_w_n": 881.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2652.0, "completions/max_terminated_length": 2652.0, "completions/mean_length": 560.47265625, "completions/mean_terminated_length": 564.8858032226562, "completions/min_length": 0.0, "completions/min_terminated_length": 116.0, "epoch": 0.12906666666666666, "grad_norm": 0.0363897942006588, "kl": 0.067901611328125, "learning_rate": 2.1944444444444445e-06, "loss": 0.0397, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.031554028391838074, "mask/share_reasoning": 0.8472678661346436, "mask/share_step_conf": 0.11336560547351837, "num_tokens": 28352704.0, "reward": 0.933656632900238, "reward_std": 0.20358788967132568, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.7196906208992004, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8468413352966309, "step": 121 }, { "adv/mean_abs_final_conf": 0.6004906296730042, "adv/mean_abs_reasoning": 0.376475065946579, "adv/mean_abs_step_conf": 0.7485677599906921, "adv/ratio_final_to_reasoning": 1.595034263858028, "adv/ratio_step_to_reasoning": 1.9883594630856969, "adv/std_final_conf": 0.8289437294006348, "adv/std_reasoning": 0.6612106561660767, "adv/std_step_conf": 0.9336329698562622, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8526247258771928, "calib/avg_num_step_conf": 6.37890625, "calib/ece": 0.169233870967742, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6895161290322581, "calib/gap": 0.5107017543859651, "calib/mean_conf": 0.7434274193548387, "calib/mu_c": 0.9411184210526317, "calib/mu_w": 0.4304166666666666, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1498790322580646, "calib/std_conf": 0.4027986103979277, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5285681818181818, "calib/step_q_c_n": 880.0, "calib/step_q_gap": 0.2499758843414221, "calib/step_q_w": 0.2785922974767597, "calib/step_q_w_n": 753.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2376.0, "completions/max_terminated_length": 2376.0, "completions/mean_length": 499.25, "completions/mean_terminated_length": 509.1952209472656, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.13013333333333332, "grad_norm": 0.03530493378639221, "kl": 0.07299041748046875, "learning_rate": 2.166666666666667e-06, "loss": -0.0542, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03175698220729828, "mask/share_reasoning": 0.8294092416763306, "mask/share_step_conf": 0.11930252611637115, "num_tokens": 28587856.0, "reward": 0.9849426746368408, "reward_std": 0.18427860736846924, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7999886274337769, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8566153049468994, "step": 122 }, { "adv/mean_abs_final_conf": 0.7172747850418091, "adv/mean_abs_reasoning": 0.535598635673523, "adv/mean_abs_step_conf": 0.7722024917602539, "adv/ratio_final_to_reasoning": 1.3392020391161485, "adv/ratio_step_to_reasoning": 1.4417558976586977, "adv/std_final_conf": 0.8936556577682495, "adv/std_reasoning": 0.7928531765937805, "adv/std_step_conf": 0.9344001412391663, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7689959294436906, "calib/avg_num_step_conf": 6.48828125, "calib/ece": 0.22217213114754114, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.5860655737704918, "calib/gap": 0.3767801899592945, "calib/mean_conf": 0.6986475409836066, "calib/mu_c": 0.8685074626865672, "calib/mu_w": 0.4917272727272727, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1858196721311477, "calib/std_conf": 0.39190609075308214, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4737954239569313, "calib/step_q_c_n": 743.0, "calib/step_q_gap": 0.16402418212686593, "calib/step_q_w": 0.3097712418300654, "calib/step_q_w_n": 918.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2910.0, "completions/max_terminated_length": 2910.0, "completions/mean_length": 599.44921875, "completions/mean_terminated_length": 606.5573120117188, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.1312, "grad_norm": 0.042939551174640656, "kl": 0.05826568603515625, "learning_rate": 2.138888888888889e-06, "loss": -0.0356, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.029819615185260773, "mask/share_reasoning": 0.8451920747756958, "mask/share_step_conf": 0.11326956003904343, "num_tokens": 28846603.0, "reward": 0.9171556830406189, "reward_std": 0.22644327580928802, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7272871136665344, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8117117285728455, "step": 123 }, { "adv/mean_abs_final_conf": 0.5510287284851074, "adv/mean_abs_reasoning": 0.43313780426979065, "adv/mean_abs_step_conf": 0.7770897150039673, "adv/ratio_final_to_reasoning": 1.272178791722104, "adv/ratio_step_to_reasoning": 1.7940934902092676, "adv/std_final_conf": 0.7954535484313965, "adv/std_reasoning": 0.7204844951629639, "adv/std_step_conf": 0.9341284036636353, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7866544117647059, "calib/avg_num_step_conf": 5.9765625, "calib/ece": 0.15532, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.692, "calib/gap": 0.4443235294117646, "calib/mean_conf": 0.76964, "calib/mu_c": 0.9118235294117647, "calib/mu_w": 0.4675000000000001, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.12248, "calib/std_conf": 0.3741586166320375, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.498728813559322, "calib/step_q_c_n": 944.0, "calib/step_q_gap": 0.13524758489038002, "calib/step_q_w": 0.363481228668942, "calib/step_q_w_n": 586.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2706.0, "completions/max_terminated_length": 2706.0, "completions/mean_length": 536.6171875, "completions/mean_terminated_length": 542.9802856445312, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.13226666666666667, "grad_norm": 0.03145952895283699, "kl": 0.065338134765625, "learning_rate": 2.1111111111111114e-06, "loss": -0.0023, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.0311250202357769, "mask/share_reasoning": 0.8385068774223328, "mask/share_step_conf": 0.11864937096834183, "num_tokens": 29090793.0, "reward": 0.9903885722160339, "reward_std": 0.17230892181396484, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.8046581745147705, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8487750291824341, "step": 124 }, { "adv/mean_abs_final_conf": 0.6158657073974609, "adv/mean_abs_reasoning": 0.4719720482826233, "adv/mean_abs_step_conf": 0.7359480857849121, "adv/ratio_final_to_reasoning": 1.3048775020436638, "adv/ratio_step_to_reasoning": 1.559304387755218, "adv/std_final_conf": 0.8484397530555725, "adv/std_reasoning": 0.7394078373908997, "adv/std_step_conf": 0.9346246719360352, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6562948467058056, "calib/avg_num_step_conf": 5.60546875, "calib/ece": 0.2996414342629482, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6533864541832669, "calib/gap": 0.2253248532289629, "calib/mean_conf": 0.7264940239043826, "calib/mu_c": 0.8207534246575343, "calib/mu_w": 0.5954285714285714, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.22223107569721112, "calib/std_conf": 0.3989124395483772, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4958344640434193, "calib/step_q_c_n": 737.0, "calib/step_q_gap": 0.09643618324112702, "calib/step_q_w": 0.3993982808022923, "calib/step_q_w_n": 698.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2559.0, "completions/max_terminated_length": 2559.0, "completions/mean_length": 551.88671875, "completions/mean_terminated_length": 556.2322998046875, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.13333333333333333, "grad_norm": 0.029080787673592567, "kl": 0.065887451171875, "learning_rate": 2.0833333333333334e-06, "loss": -0.0555, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03245328366756439, "mask/share_reasoning": 0.8474924564361572, "mask/share_step_conf": 0.1122417151927948, "num_tokens": 29336884.0, "reward": 0.9050019383430481, "reward_std": 0.20497554540634155, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.672819972038269, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8270276784896851, "step": 125 }, { "adv/mean_abs_final_conf": 0.5700531005859375, "adv/mean_abs_reasoning": 0.4702516198158264, "adv/mean_abs_step_conf": 0.7739061117172241, "adv/ratio_final_to_reasoning": 1.2122299563990833, "adv/ratio_step_to_reasoning": 1.6457276894023751, "adv/std_final_conf": 0.7908639311790466, "adv/std_reasoning": 0.7206709384918213, "adv/std_step_conf": 0.9327467083930969, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.8016865079365079, "calib/avg_num_step_conf": 5.65234375, "calib/ece": 0.20149797570850211, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5870445344129555, "calib/gap": 0.46720899470899474, "calib/mean_conf": 0.67, "calib/mu_c": 0.8818518518518519, "calib/mu_w": 0.41464285714285715, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16246963562753047, "calib/std_conf": 0.4209455329379382, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.49954285714285723, "calib/step_q_c_n": 700.0, "calib/step_q_gap": 0.1847905144387073, "calib/step_q_w": 0.3147523427041499, "calib/step_q_w_n": 747.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2843.0, "completions/max_terminated_length": 2843.0, "completions/mean_length": 531.10546875, "completions/mean_terminated_length": 539.5357666015625, "completions/min_length": 0.0, "completions/min_terminated_length": 68.0, "epoch": 0.1344, "grad_norm": 0.04010576009750366, "kl": 0.077484130859375, "learning_rate": 2.0555555555555555e-06, "loss": -0.0943, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.035097844898700714, "mask/share_reasoning": 0.8299784064292908, "mask/share_step_conf": 0.11929875612258911, "num_tokens": 29578311.0, "reward": 0.9459041357040405, "reward_std": 0.17431926727294922, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7634941339492798, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8298766613006592, "step": 126 }, { "adv/mean_abs_final_conf": 0.5844104886054993, "adv/mean_abs_reasoning": 0.42019808292388916, "adv/mean_abs_step_conf": 0.7377088069915771, "adv/ratio_final_to_reasoning": 1.3907976079732711, "adv/ratio_step_to_reasoning": 1.7556215436737226, "adv/std_final_conf": 0.8205244541168213, "adv/std_reasoning": 0.7013723850250244, "adv/std_step_conf": 0.9340208768844604, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7519230769230769, "calib/avg_num_step_conf": 6.1484375, "calib/ece": 0.2734959349593497, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6097560975609756, "calib/gap": 0.32465119363395234, "calib/mean_conf": 0.6983739837398374, "calib/mu_c": 0.8514615384615385, "calib/mu_w": 0.5268103448275862, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22170731707317082, "calib/std_conf": 0.41161959307648976, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5116850828729281, "calib/step_q_c_n": 724.0, "calib/step_q_gap": 0.2000262593435163, "calib/step_q_w": 0.3116588235294118, "calib/step_q_w_n": 850.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2754.0, "completions/max_terminated_length": 2754.0, "completions/mean_length": 515.87890625, "completions/mean_terminated_length": 524.0675048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 126.0, "epoch": 0.13546666666666668, "grad_norm": 0.031249074265360832, "kl": 0.06769561767578125, "learning_rate": 2.027777777777778e-06, "loss": -0.0101, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.033468492329120636, "mask/share_reasoning": 0.8253788948059082, "mask/share_step_conf": 0.12552762031555176, "num_tokens": 29814048.0, "reward": 0.901087760925293, "reward_std": 0.19262036681175232, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.6864038705825806, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8220216631889343, "step": 127 }, { "adv/mean_abs_final_conf": 0.6289718747138977, "adv/mean_abs_reasoning": 0.5378081798553467, "adv/mean_abs_step_conf": 0.7337651252746582, "adv/ratio_final_to_reasoning": 1.169509684443757, "adv/ratio_step_to_reasoning": 1.3643621513380806, "adv/std_final_conf": 0.8287432789802551, "adv/std_reasoning": 0.775564432144165, "adv/std_step_conf": 0.9350315928459167, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7391482391482391, "calib/avg_num_step_conf": 5.26171875, "calib/ece": 0.2504526748971194, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.5967078189300411, "calib/gap": 0.3706777231777232, "calib/mean_conf": 0.6675720164609054, "calib/mu_c": 0.8368939393939394, "calib/mu_w": 0.46621621621621623, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.1874074074074075, "calib/std_conf": 0.43235248674798726, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.5065650080256823, "calib/step_q_c_n": 623.0, "calib/step_q_gap": 0.19077771520800274, "calib/step_q_w": 0.31578729281767953, "calib/step_q_w_n": 724.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2343.0, "completions/max_terminated_length": 2343.0, "completions/mean_length": 509.5859375, "completions/mean_terminated_length": 521.8160400390625, "completions/min_length": 0.0, "completions/min_terminated_length": 83.0, "epoch": 0.13653333333333334, "grad_norm": 0.05047476291656494, "kl": 0.07059478759765625, "learning_rate": 2.0000000000000003e-06, "loss": -0.1925, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.03439555689692497, "mask/share_reasoning": 0.8320193290710449, "mask/share_step_conf": 0.11014766991138458, "num_tokens": 30051166.0, "reward": 0.8882846832275391, "reward_std": 0.2592315077781677, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.6846984624862671, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.8020271062850952, "step": 128 }, { "adv/mean_abs_final_conf": 0.5699493885040283, "adv/mean_abs_reasoning": 0.3200322091579437, "adv/mean_abs_step_conf": 0.7701667547225952, "adv/ratio_final_to_reasoning": 1.7809125837791668, "adv/ratio_step_to_reasoning": 2.406528882667866, "adv/std_final_conf": 0.8245267868041992, "adv/std_reasoning": 0.6400149464607239, "adv/std_step_conf": 0.9331631064414978, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7107128113389295, "calib/avg_num_step_conf": 5.328125, "calib/ece": 0.22188235294117642, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.7372549019607844, "calib/gap": 0.277128113389294, "calib/mean_conf": 0.8085490196078432, "calib/mu_c": 0.9020118343195266, "calib/mu_w": 0.6248837209302326, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18384313725490192, "calib/std_conf": 0.3402211687645397, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5089479512735327, "calib/step_q_c_n": 903.0, "calib/step_q_gap": 0.07988070615422677, "calib/step_q_w": 0.4290672451193059, "calib/step_q_w_n": 461.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1474.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 449.76953125, "completions/mean_terminated_length": 451.5333557128906, "completions/min_length": 0.0, "completions/min_terminated_length": 177.0, "epoch": 0.1376, "grad_norm": 0.03063477762043476, "kl": 0.07349395751953125, "learning_rate": 1.9722222222222224e-06, "loss": -0.0181, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03671756386756897, "mask/share_reasoning": 0.8324686288833618, "mask/share_step_conf": 0.12690752744674683, "num_tokens": 30268691.0, "reward": 0.9700326919555664, "reward_std": 0.16116727888584137, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.760378897190094, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8484363555908203, "step": 129 }, { "adv/mean_abs_final_conf": 0.5980579853057861, "adv/mean_abs_reasoning": 0.27735209465026855, "adv/mean_abs_step_conf": 0.7554200887680054, "adv/ratio_final_to_reasoning": 2.156313209243711, "adv/ratio_step_to_reasoning": 2.72368625778927, "adv/std_final_conf": 0.8363456130027771, "adv/std_reasoning": 0.5726004242897034, "adv/std_step_conf": 0.9338682293891907, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7849537685626227, "calib/avg_num_step_conf": 4.9140625, "calib/ece": 0.16988095238095235, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7023809523809523, "calib/gap": 0.480421686746988, "calib/mean_conf": 0.741468253968254, "calib/mu_c": 0.905421686746988, "calib/mu_w": 0.425, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12630952380952382, "calib/std_conf": 0.4020747617938168, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5338902743142144, "calib/step_q_c_n": 802.0, "calib/step_q_gap": 0.09031571291070573, "calib/step_q_w": 0.4435745614035087, "calib/step_q_w_n": 456.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1697.0, "completions/max_terminated_length": 1697.0, "completions/mean_length": 450.4921875, "completions/mean_terminated_length": 457.64288330078125, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.13866666666666666, "grad_norm": 0.06043770909309387, "kl": 0.0726165771484375, "learning_rate": 1.944444444444445e-06, "loss": -0.0455, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.037092648446559906, "mask/share_reasoning": 0.8293949365615845, "mask/share_step_conf": 0.11788740009069443, "num_tokens": 30489305.0, "reward": 0.9999578595161438, "reward_std": 0.1384398490190506, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.8098331689834595, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8635199069976807, "step": 130 }, { "adv/mean_abs_final_conf": 0.644719123840332, "adv/mean_abs_reasoning": 0.30147671699523926, "adv/mean_abs_step_conf": 0.7534003257751465, "adv/ratio_final_to_reasoning": 2.1385370328631814, "adv/ratio_step_to_reasoning": 2.499033203240845, "adv/std_final_conf": 0.8671888113021851, "adv/std_reasoning": 0.5959498882293701, "adv/std_step_conf": 0.9341416954994202, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8221227621483376, "calib/avg_num_step_conf": 5.1953125, "calib/ece": 0.21262948207171306, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.49800796812749004, "calib/gap": 0.4851259590792839, "calib/mean_conf": 0.5955776892430279, "calib/mu_c": 0.8584347826086957, "calib/mu_w": 0.37330882352941175, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17501992031872504, "calib/std_conf": 0.4372186231651954, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.553898916967509, "calib/step_q_c_n": 554.0, "calib/step_q_gap": 0.2200973705757565, "calib/step_q_w": 0.3338015463917525, "calib/step_q_w_n": 776.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2524.0, "completions/max_terminated_length": 2524.0, "completions/mean_length": 483.109375, "completions/mean_terminated_length": 488.83795166015625, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.13973333333333332, "grad_norm": 0.03974044695496559, "kl": 0.07405853271484375, "learning_rate": 1.916666666666667e-06, "loss": -0.1017, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.034373268485069275, "mask/share_reasoning": 0.8387865424156189, "mask/share_step_conf": 0.11512146890163422, "num_tokens": 30719189.0, "reward": 0.9495083093643188, "reward_std": 0.16720762848854065, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.7672886848449707, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8457905054092407, "step": 131 }, { "adv/mean_abs_final_conf": 0.6128841638565063, "adv/mean_abs_reasoning": 0.540101945400238, "adv/mean_abs_step_conf": 0.7529127597808838, "adv/ratio_final_to_reasoning": 1.1347564456601498, "adv/ratio_step_to_reasoning": 1.3940197145984061, "adv/std_final_conf": 0.8160517811775208, "adv/std_reasoning": 0.7577255964279175, "adv/std_step_conf": 0.9345849752426147, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7750711478520125, "calib/avg_num_step_conf": 5.6796875, "calib/ece": 0.21195219123505982, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6414342629482072, "calib/gap": 0.38740547499661204, "calib/mean_conf": 0.7143426294820717, "calib/mu_c": 0.8594267515923567, "calib/mu_w": 0.47202127659574467, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15039840637450202, "calib/std_conf": 0.40855574277647305, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.546679292929293, "calib/step_q_c_n": 792.0, "calib/step_q_gap": 0.17696630199273705, "calib/step_q_w": 0.3697129909365559, "calib/step_q_w_n": 662.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2861.0, "completions/max_terminated_length": 2861.0, "completions/mean_length": 516.65625, "completions/mean_terminated_length": 520.7244262695312, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.1408, "grad_norm": 0.030295290052890778, "kl": 0.066131591796875, "learning_rate": 1.888888888888889e-06, "loss": 0.0593, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03516693413257599, "mask/share_reasoning": 0.8338392972946167, "mask/share_step_conf": 0.12318122386932373, "num_tokens": 30957045.0, "reward": 0.9622828364372253, "reward_std": 0.20570440590381622, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7573515772819519, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8476827144622803, "step": 132 }, { "adv/mean_abs_final_conf": 0.6930422782897949, "adv/mean_abs_reasoning": 0.6090747117996216, "adv/mean_abs_step_conf": 0.7614809274673462, "adv/ratio_final_to_reasoning": 1.1378608647896018, "adv/ratio_step_to_reasoning": 1.250225814198414, "adv/std_final_conf": 0.8737168312072754, "adv/std_reasoning": 0.826562762260437, "adv/std_step_conf": 0.9344897866249084, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.775923016496465, "calib/avg_num_step_conf": 6.046875, "calib/ece": 0.25278225806451615, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5362903225806451, "calib/gap": 0.3981657501963865, "calib/mean_conf": 0.6040725806451613, "calib/mu_c": 0.8192105263157895, "calib/mu_w": 0.421044776119403, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19858870967741937, "calib/std_conf": 0.4442454035811937, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5027538461538461, "calib/step_q_c_n": 650.0, "calib/step_q_gap": 0.16200774370395743, "calib/step_q_w": 0.3407461024498887, "calib/step_q_w_n": 898.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2803.0, "completions/max_terminated_length": 2803.0, "completions/mean_length": 598.63671875, "completions/mean_terminated_length": 603.3504028320312, "completions/min_length": 0.0, "completions/min_terminated_length": 55.0, "epoch": 0.14186666666666667, "grad_norm": 0.04612388089299202, "kl": 0.0608673095703125, "learning_rate": 1.8611111111111113e-06, "loss": -0.0083, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.02898087352514267, "mask/share_reasoning": 0.853644609451294, "mask/share_step_conf": 0.10956203192472458, "num_tokens": 31216640.0, "reward": 0.9158010482788086, "reward_std": 0.23849625885486603, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.7083597183227539, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8404297828674316, "step": 133 }, { "adv/mean_abs_final_conf": 0.7420611381530762, "adv/mean_abs_reasoning": 0.6442513465881348, "adv/mean_abs_step_conf": 0.7690349817276001, "adv/ratio_final_to_reasoning": 1.1518193048146945, "adv/ratio_step_to_reasoning": 1.1936878142363256, "adv/std_final_conf": 0.8912635445594788, "adv/std_reasoning": 0.8429958820343018, "adv/std_step_conf": 0.9348194003105164, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7227853099907858, "calib/avg_num_step_conf": 5.8671875, "calib/ece": 0.2561044176706827, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.606425702811245, "calib/gap": 0.3689844675529814, "calib/mean_conf": 0.6691164658634536, "calib/mu_c": 0.8276760563380282, "calib/mu_w": 0.4586915887850468, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.17746987951807228, "calib/std_conf": 0.43159555300705044, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4647212336892052, "calib/step_q_c_n": 843.0, "calib/step_q_gap": 0.1383175918075663, "calib/step_q_w": 0.3264036418816389, "calib/step_q_w_n": 659.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2848.0, "completions/max_terminated_length": 2848.0, "completions/mean_length": 574.48046875, "completions/mean_terminated_length": 581.2925415039062, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.14293333333333333, "grad_norm": 0.03696267306804657, "kl": 0.05755615234375, "learning_rate": 1.8333333333333333e-06, "loss": -0.1123, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.029521383345127106, "mask/share_reasoning": 0.8535020351409912, "mask/share_step_conf": 0.10525783896446228, "num_tokens": 31472659.0, "reward": 0.9212626218795776, "reward_std": 0.26008397340774536, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7156097292900085, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8222278356552124, "step": 134 }, { "adv/mean_abs_final_conf": 0.617477536201477, "adv/mean_abs_reasoning": 0.4812614321708679, "adv/mean_abs_step_conf": 0.7539912462234497, "adv/ratio_final_to_reasoning": 1.2830397262796798, "adv/ratio_step_to_reasoning": 1.5666978399294447, "adv/std_final_conf": 0.8101888298988342, "adv/std_reasoning": 0.7393453121185303, "adv/std_step_conf": 0.9331625699996948, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7016129032258065, "calib/avg_num_step_conf": 5.84765625, "calib/ece": 0.23706827309236939, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6265060240963856, "calib/gap": 0.34203267162944595, "calib/mean_conf": 0.713855421686747, "calib/mu_c": 0.8416025641025643, "calib/mu_w": 0.49956989247311834, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16220883534136535, "calib/std_conf": 0.40319655881577965, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.48586248492159223, "calib/step_q_c_n": 829.0, "calib/step_q_gap": 0.12367685617907725, "calib/step_q_w": 0.362185628742515, "calib/step_q_w_n": 668.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2690.0, "completions/max_terminated_length": 2690.0, "completions/mean_length": 527.8671875, "completions/mean_terminated_length": 534.1265258789062, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.144, "grad_norm": 0.04049808531999588, "kl": 0.06581878662109375, "learning_rate": 1.8055555555555557e-06, "loss": -0.0241, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03396621719002724, "mask/share_reasoning": 0.8378646969795227, "mask/share_step_conf": 0.11645033210515976, "num_tokens": 31713673.0, "reward": 0.9581298828125, "reward_std": 0.1770906001329422, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7313003540039062, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.86933434009552, "step": 135 }, { "adv/mean_abs_final_conf": 0.6305532455444336, "adv/mean_abs_reasoning": 0.45713546872138977, "adv/mean_abs_step_conf": 0.7678828835487366, "adv/ratio_final_to_reasoning": 1.3793575180418491, "adv/ratio_step_to_reasoning": 1.6797709565099135, "adv/std_final_conf": 0.8364690542221069, "adv/std_reasoning": 0.7206043004989624, "adv/std_step_conf": 0.9332603812217712, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7863091806009354, "calib/avg_num_step_conf": 6.26953125, "calib/ece": 0.23220000000000002, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.512, "calib/gap": 0.4209782817605227, "calib/mean_conf": 0.5798, "calib/mu_c": 0.7970247933884297, "calib/mu_w": 0.376046511627907, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.164, "calib/std_conf": 0.4506994120253542, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4941863905325444, "calib/step_q_c_n": 676.0, "calib/step_q_gap": 0.2114737963452462, "calib/step_q_w": 0.28271259418729816, "calib/step_q_w_n": 929.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2788.0, "completions/max_terminated_length": 2788.0, "completions/mean_length": 545.4140625, "completions/mean_terminated_length": 551.8814697265625, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.14506666666666668, "grad_norm": 0.04689570143818855, "kl": 0.0675506591796875, "learning_rate": 1.777777777777778e-06, "loss": -0.0301, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03165838494896889, "mask/share_reasoning": 0.8355990648269653, "mask/share_step_conf": 0.12102382630109787, "num_tokens": 31961787.0, "reward": 0.9410368204116821, "reward_std": 0.18190747499465942, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.7306855320930481, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8615442514419556, "step": 136 }, { "adv/mean_abs_final_conf": 0.6028730869293213, "adv/mean_abs_reasoning": 0.43961775302886963, "adv/mean_abs_step_conf": 0.7608221769332886, "adv/ratio_final_to_reasoning": 1.3713574640142676, "adv/ratio_step_to_reasoning": 1.7306447969659804, "adv/std_final_conf": 0.8279385566711426, "adv/std_reasoning": 0.7205258011817932, "adv/std_step_conf": 0.933806300163269, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.831116158338741, "calib/avg_num_step_conf": 6.3203125, "calib/ece": 0.1833734939759037, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5502008032128514, "calib/gap": 0.47973588578844906, "calib/mean_conf": 0.6558232931726908, "calib/mu_c": 0.8773880597014926, "calib/mu_w": 0.3976521739130435, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.15052208835341374, "calib/std_conf": 0.4197037405347656, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.49825970548862114, "calib/step_q_c_n": 747.0, "calib/step_q_gap": 0.18530907403052693, "calib/step_q_w": 0.3129506314580942, "calib/step_q_w_n": 871.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2792.0, "completions/max_terminated_length": 2792.0, "completions/mean_length": 515.35546875, "completions/mean_terminated_length": 521.4664306640625, "completions/min_length": 0.0, "completions/min_terminated_length": 109.0, "epoch": 0.14613333333333334, "grad_norm": 0.03188343718647957, "kl": 0.07834625244140625, "learning_rate": 1.75e-06, "loss": -0.0948, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03322169929742813, "mask/share_reasoning": 0.8309173583984375, "mask/share_step_conf": 0.12414221465587616, "num_tokens": 32200702.0, "reward": 0.9673022031784058, "reward_std": 0.18526697158813477, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.778056263923645, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8573293685913086, "step": 137 }, { "adv/mean_abs_final_conf": 0.6043561697006226, "adv/mean_abs_reasoning": 0.4610038101673126, "adv/mean_abs_step_conf": 0.7497239112854004, "adv/ratio_final_to_reasoning": 1.3109569950870534, "adv/ratio_step_to_reasoning": 1.6262857155417052, "adv/std_final_conf": 0.8366493582725525, "adv/std_reasoning": 0.7207646369934082, "adv/std_step_conf": 0.9337040185928345, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7530487804878049, "calib/avg_num_step_conf": 6.171875, "calib/ece": 0.23643999999999998, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.628, "calib/gap": 0.332033465683494, "calib/mean_conf": 0.717, "calib/mu_c": 0.8312195121951219, "calib/mu_w": 0.4991860465116279, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.14872000000000002, "calib/std_conf": 0.4039220221775485, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4687580993520518, "calib/step_q_c_n": 926.0, "calib/step_q_gap": 0.1361128394132139, "calib/step_q_w": 0.3326452599388379, "calib/step_q_w_n": 654.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1269.0, "completions/max_terminated_length": 1269.0, "completions/mean_length": 494.53125, "completions/mean_terminated_length": 498.4252014160156, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.1472, "grad_norm": 0.04823905602097511, "kl": 0.07939910888671875, "learning_rate": 1.7222222222222224e-06, "loss": -0.1478, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03357618302106857, "mask/share_reasoning": 0.8358300924301147, "mask/share_step_conf": 0.12278124690055847, "num_tokens": 32431638.0, "reward": 0.9525300860404968, "reward_std": 0.2015206217765808, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7395683526992798, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8420543670654297, "step": 138 }, { "adv/mean_abs_final_conf": 0.6356931924819946, "adv/mean_abs_reasoning": 0.48741334676742554, "adv/mean_abs_step_conf": 0.7561699151992798, "adv/ratio_final_to_reasoning": 1.304217860873068, "adv/ratio_step_to_reasoning": 1.551393535310994, "adv/std_final_conf": 0.8453860282897949, "adv/std_reasoning": 0.7575779557228088, "adv/std_step_conf": 0.9334150552749634, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8276442307692307, "calib/avg_num_step_conf": 5.5, "calib/ece": 0.19976095617529877, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6374501992031872, "calib/gap": 0.44426442307692326, "calib/mean_conf": 0.7101195219123506, "calib/mu_c": 0.8711875000000001, "calib/mu_w": 0.42692307692307685, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.136215139442231, "calib/std_conf": 0.40911768056534276, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5002891566265061, "calib/step_q_c_n": 830.0, "calib/step_q_gap": 0.1729535164880978, "calib/step_q_w": 0.32733564013840827, "calib/step_q_w_n": 578.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2796.0, "completions/max_terminated_length": 2796.0, "completions/mean_length": 474.73046875, "completions/mean_terminated_length": 480.3597106933594, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.14826666666666666, "grad_norm": 0.03851529210805893, "kl": 0.06740570068359375, "learning_rate": 1.6944444444444446e-06, "loss": -0.063, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03441895917057991, "mask/share_reasoning": 0.8336876034736633, "mask/share_step_conf": 0.12017463147640228, "num_tokens": 32656265.0, "reward": 0.9894264936447144, "reward_std": 0.18618866801261902, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7859241962432861, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8718349933624268, "step": 139 }, { "adv/mean_abs_final_conf": 0.5681322813034058, "adv/mean_abs_reasoning": 0.41635167598724365, "adv/mean_abs_step_conf": 0.7558364272117615, "adv/ratio_final_to_reasoning": 1.364549043681074, "adv/ratio_step_to_reasoning": 1.8153798118370468, "adv/std_final_conf": 0.8017783761024475, "adv/std_reasoning": 0.7013971209526062, "adv/std_step_conf": 0.9335141777992249, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7918732782369146, "calib/avg_num_step_conf": 5.4296875, "calib/ece": 0.1861264822134387, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.691699604743083, "calib/gap": 0.39036363636363636, "calib/mean_conf": 0.7545849802371543, "calib/mu_c": 0.8903636363636364, "calib/mu_w": 0.5, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.14426877470355728, "calib/std_conf": 0.38078596388028896, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5402469135802469, "calib/step_q_c_n": 891.0, "calib/step_q_gap": 0.19100843662633904, "calib/step_q_w": 0.34923847695390786, "calib/step_q_w_n": 499.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2622.0, "completions/max_terminated_length": 2622.0, "completions/mean_length": 501.6171875, "completions/mean_terminated_length": 503.5843505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.14933333333333335, "grad_norm": 0.04033830761909485, "kl": 0.06805419921875, "learning_rate": 1.6666666666666667e-06, "loss": 0.0355, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0332154780626297, "mask/share_reasoning": 0.8421204090118408, "mask/share_step_conf": 0.1207578182220459, "num_tokens": 32889695.0, "reward": 0.9933497309684753, "reward_std": 0.16882237792015076, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7854597568511963, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8746772408485413, "step": 140 }, { "adv/mean_abs_final_conf": 0.5361478924751282, "adv/mean_abs_reasoning": 0.3555518388748169, "adv/mean_abs_step_conf": 0.7517731785774231, "adv/ratio_final_to_reasoning": 1.5079317102446368, "adv/ratio_step_to_reasoning": 2.1143841667546774, "adv/std_final_conf": 0.7747222781181335, "adv/std_reasoning": 0.640269935131073, "adv/std_step_conf": 0.9338180422782898, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8649622022536014, "calib/avg_num_step_conf": 5.93359375, "calib/ece": 0.1573122529644269, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6758893280632411, "calib/gap": 0.5345556981885607, "calib/mean_conf": 0.7171541501976284, "calib/mu_c": 0.8904093567251461, "calib/mu_w": 0.35585365853658535, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.09928853754940715, "calib/std_conf": 0.41866884376992447, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.49948293691830403, "calib/step_q_c_n": 967.0, "calib/step_q_gap": 0.1837401832951156, "calib/step_q_w": 0.31574275362318843, "calib/step_q_w_n": 552.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1824.0, "completions/max_terminated_length": 1824.0, "completions/mean_length": 539.8671875, "completions/mean_terminated_length": 544.1181030273438, "completions/min_length": 0.0, "completions/min_terminated_length": 146.0, "epoch": 0.1504, "grad_norm": 0.04087727516889572, "kl": 0.06522369384765625, "learning_rate": 1.638888888888889e-06, "loss": -0.0535, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03152162581682205, "mask/share_reasoning": 0.8432751893997192, "mask/share_step_conf": 0.11739066243171692, "num_tokens": 33134997.0, "reward": 1.0191978216171265, "reward_std": 0.1478888988494873, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.8283312320709229, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8795955181121826, "step": 141 }, { "adv/mean_abs_final_conf": 0.5493594408035278, "adv/mean_abs_reasoning": 0.35145801305770874, "adv/mean_abs_step_conf": 0.7668730020523071, "adv/ratio_final_to_reasoning": 1.563086970258732, "adv/ratio_step_to_reasoning": 2.1819761495276766, "adv/std_final_conf": 0.7770100831985474, "adv/std_reasoning": 0.6403212547302246, "adv/std_step_conf": 0.9339790344238281, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.833258137286497, "calib/avg_num_step_conf": 6.30859375, "calib/ece": 0.20769841269841277, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5714285714285714, "calib/gap": 0.4770944247502418, "calib/mean_conf": 0.6350793650793651, "calib/mu_c": 0.8376551724137932, "calib/mu_w": 0.3605607476635514, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1336904761904763, "calib/std_conf": 0.44481804025835037, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5124548736462095, "calib/step_q_c_n": 831.0, "calib/step_q_gap": 0.19458497568702576, "calib/step_q_w": 0.3178698979591837, "calib/step_q_w_n": 784.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2385.0, "completions/max_terminated_length": 2385.0, "completions/mean_length": 548.44140625, "completions/mean_terminated_length": 554.9447021484375, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.15146666666666667, "grad_norm": 0.0574471652507782, "kl": 0.06587982177734375, "learning_rate": 1.6111111111111113e-06, "loss": -0.0563, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.030428925529122353, "mask/share_reasoning": 0.8384536504745483, "mask/share_step_conf": 0.11939871311187744, "num_tokens": 33380558.0, "reward": 0.9693418741226196, "reward_std": 0.16276447474956512, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7750797271728516, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.853447675704956, "step": 142 }, { "adv/mean_abs_final_conf": 0.6312240362167358, "adv/mean_abs_reasoning": 0.5305913686752319, "adv/mean_abs_step_conf": 0.7272872924804688, "adv/ratio_final_to_reasoning": 1.189661335412902, "adv/ratio_step_to_reasoning": 1.3707107492086472, "adv/std_final_conf": 0.8579843640327454, "adv/std_reasoning": 0.7754026055335999, "adv/std_step_conf": 0.9342260360717773, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8851172404883405, "calib/avg_num_step_conf": 5.9765625, "calib/ece": 0.13715999999999995, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.52, "calib/gap": 0.5812705897551838, "calib/mean_conf": 0.6042000000000001, "calib/mu_c": 0.866934306569343, "calib/mu_w": 0.2856637168141593, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.09667999999999996, "calib/std_conf": 0.43694892149998493, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.48222695035460994, "calib/step_q_c_n": 705.0, "calib/step_q_gap": 0.20965725338491298, "calib/step_q_w": 0.27256969696969696, "calib/step_q_w_n": 825.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2965.0, "completions/max_terminated_length": 2965.0, "completions/mean_length": 567.5703125, "completions/mean_terminated_length": 569.796142578125, "completions/min_length": 0.0, "completions/min_terminated_length": 118.0, "epoch": 0.15253333333333333, "grad_norm": 0.03865446522831917, "kl": 0.06455230712890625, "learning_rate": 1.5833333333333333e-06, "loss": 0.0308, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.030564136803150177, "mask/share_reasoning": 0.847449541091919, "mask/share_step_conf": 0.11808009445667267, "num_tokens": 33633192.0, "reward": 0.9973582029342651, "reward_std": 0.20891296863555908, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.8224399089813232, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8707141280174255, "step": 143 }, { "adv/mean_abs_final_conf": 0.6034372448921204, "adv/mean_abs_reasoning": 0.48929351568222046, "adv/mean_abs_step_conf": 0.7381302118301392, "adv/ratio_final_to_reasoning": 1.2332827342924209, "adv/ratio_step_to_reasoning": 1.5085632410251062, "adv/std_final_conf": 0.8320863246917725, "adv/std_reasoning": 0.75753253698349, "adv/std_step_conf": 0.9336531758308411, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7241117522016398, "calib/avg_num_step_conf": 5.62109375, "calib/ece": 0.25031746031746027, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5634920634920635, "calib/gap": 0.38860917096872155, "calib/mean_conf": 0.6234126984126984, "calib/mu_c": 0.7375280898876404, "calib/mu_w": 0.34891891891891885, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.08369047619047619, "calib/std_conf": 0.4471473296822033, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.47799794661190964, "calib/step_q_c_n": 974.0, "calib/step_q_gap": 0.16720224768717845, "calib/step_q_w": 0.3107956989247312, "calib/step_q_w_n": 465.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1985.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 508.921875, "completions/mean_terminated_length": 510.91766357421875, "completions/min_length": 0.0, "completions/min_terminated_length": 71.0, "epoch": 0.1536, "grad_norm": 0.0469198077917099, "kl": 0.096343994140625, "learning_rate": 1.5555555555555558e-06, "loss": 0.0299, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03401659429073334, "mask/share_reasoning": 0.8417648077011108, "mask/share_step_conf": 0.1203123927116394, "num_tokens": 33867604.0, "reward": 0.9624280333518982, "reward_std": 0.16957436501979828, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.7329937219619751, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8567060828208923, "step": 144 }, { "adv/mean_abs_final_conf": 0.6340999007225037, "adv/mean_abs_reasoning": 0.4605696201324463, "adv/mean_abs_step_conf": 0.7574340105056763, "adv/ratio_final_to_reasoning": 1.3767731804372054, "adv/ratio_step_to_reasoning": 1.6445592097191746, "adv/std_final_conf": 0.8457151055335999, "adv/std_reasoning": 0.7392440438270569, "adv/std_step_conf": 0.9339003562927246, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7269218651543794, "calib/avg_num_step_conf": 6.28515625, "calib/ece": 0.24047430830039523, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5573122529644269, "calib/gap": 0.3170833333333332, "calib/mean_conf": 0.6568379446640317, "calib/mu_c": 0.7433152173913042, "calib/mu_w": 0.426231884057971, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.08501976284584978, "calib/std_conf": 0.4133133751405631, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4666238532110092, "calib/step_q_c_n": 1090.0, "calib/step_q_gap": 0.12361807286418836, "calib/step_q_w": 0.34300578034682083, "calib/step_q_w_n": 519.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1685.0, "completions/max_terminated_length": 1685.0, "completions/mean_length": 491.21484375, "completions/mean_terminated_length": 495.0826721191406, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.15466666666666667, "grad_norm": 0.05354519188404083, "kl": 0.06645965576171875, "learning_rate": 1.527777777777778e-06, "loss": -0.0135, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.034240782260894775, "mask/share_reasoning": 0.8234318494796753, "mask/share_step_conf": 0.1345149278640747, "num_tokens": 34096059.0, "reward": 0.9719904661178589, "reward_std": 0.17904748022556305, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.7428406476974487, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.859734058380127, "step": 145 }, { "adv/mean_abs_final_conf": 0.6603371500968933, "adv/mean_abs_reasoning": 0.4713112711906433, "adv/mean_abs_step_conf": 0.7333450317382812, "adv/ratio_final_to_reasoning": 1.401063777721093, "adv/ratio_step_to_reasoning": 1.5559675241495476, "adv/std_final_conf": 0.8613228797912598, "adv/std_reasoning": 0.7393918037414551, "adv/std_step_conf": 0.9345128536224365, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7508116883116884, "calib/avg_num_step_conf": 5.953125, "calib/ece": 0.23447999999999997, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": 0.4116038961038961, "calib/mean_conf": 0.5883200000000001, "calib/mu_c": 0.8188181818181818, "calib/mu_w": 0.4072142857142857, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.19139999999999996, "calib/std_conf": 0.43612702002971565, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4818808777429467, "calib/step_q_c_n": 638.0, "calib/step_q_gap": 0.15795311250592636, "calib/step_q_w": 0.3239277652370203, "calib/step_q_w_n": 886.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2879.0, "completions/max_terminated_length": 2879.0, "completions/mean_length": 535.0390625, "completions/mean_terminated_length": 541.3834228515625, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.15573333333333333, "grad_norm": 0.04511573165655136, "kl": 0.0630950927734375, "learning_rate": 1.5e-06, "loss": -0.0597, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.031575046479701996, "mask/share_reasoning": 0.8401821851730347, "mask/share_step_conf": 0.11652399599552155, "num_tokens": 34340245.0, "reward": 0.9193023443222046, "reward_std": 0.20384211838245392, "rewards/accuracy_reward_step": 0.4296875, "rewards/final_brier_reward_step": 0.7231277227401733, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8350081443786621, "step": 146 }, { "adv/mean_abs_final_conf": 0.5824941992759705, "adv/mean_abs_reasoning": 0.3738293945789337, "adv/mean_abs_step_conf": 0.7618895173072815, "adv/ratio_final_to_reasoning": 1.5581819079049906, "adv/ratio_step_to_reasoning": 2.0380674402702947, "adv/std_final_conf": 0.8140949606895447, "adv/std_reasoning": 0.6403229832649231, "adv/std_step_conf": 0.9339715242385864, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7655609631147541, "calib/avg_num_step_conf": 6.17578125, "calib/ece": 0.27816, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.588, "calib/gap": 0.3858299180327869, "calib/mean_conf": 0.64016, "calib/mu_c": 0.8377049180327869, "calib/mu_w": 0.451875, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.21516000000000002, "calib/std_conf": 0.4442692588959988, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5155219780219781, "calib/step_q_c_n": 728.0, "calib/step_q_gap": 0.17739771073006716, "calib/step_q_w": 0.3381242672919109, "calib/step_q_w_n": 853.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2620.0, "completions/max_terminated_length": 2620.0, "completions/mean_length": 548.27734375, "completions/mean_terminated_length": 554.7786865234375, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.1568, "grad_norm": 0.04079528525471687, "kl": 0.0682525634765625, "learning_rate": 1.4722222222222225e-06, "loss": -0.0792, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.030725400894880295, "mask/share_reasoning": 0.8386132121086121, "mask/share_step_conf": 0.11894263327121735, "num_tokens": 34584284.0, "reward": 0.9176037311553955, "reward_std": 0.18044179677963257, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.7054883241653442, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8390940427780151, "step": 147 }, { "adv/mean_abs_final_conf": 0.5602353811264038, "adv/mean_abs_reasoning": 0.43357449769973755, "adv/mean_abs_step_conf": 0.7547708749771118, "adv/ratio_final_to_reasoning": 1.2921317653566942, "adv/ratio_step_to_reasoning": 1.7408101237075335, "adv/std_final_conf": 0.8055301308631897, "adv/std_reasoning": 0.7204562425613403, "adv/std_step_conf": 0.9323009848594666, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8212683924777715, "calib/avg_num_step_conf": 5.703125, "calib/ece": 0.17796000000000006, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.692, "calib/gap": 0.45802423479424037, "calib/mean_conf": 0.73372, "calib/mu_c": 0.8637988826815642, "calib/mu_w": 0.40577464788732387, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.09784000000000004, "calib/std_conf": 0.40944274520377083, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5178674948240165, "calib/step_q_c_n": 966.0, "calib/step_q_gap": 0.18732093611956302, "calib/step_q_w": 0.3305465587044535, "calib/step_q_w_n": 494.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2539.0, "completions/max_terminated_length": 2539.0, "completions/mean_length": 519.296875, "completions/mean_terminated_length": 521.3333740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 83.0, "epoch": 0.15786666666666666, "grad_norm": 0.056484851986169815, "kl": 0.07324981689453125, "learning_rate": 1.4444444444444445e-06, "loss": -0.0459, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03550893813371658, "mask/share_reasoning": 0.8370537161827087, "mask/share_step_conf": 0.12353110313415527, "num_tokens": 34822336.0, "reward": 0.9942589998245239, "reward_std": 0.1536797285079956, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.7958706617355347, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8574910163879395, "step": 148 }, { "adv/mean_abs_final_conf": 0.5460543632507324, "adv/mean_abs_reasoning": 0.4494268000125885, "adv/mean_abs_step_conf": 0.7579556107521057, "adv/ratio_final_to_reasoning": 1.2150017827940776, "adv/ratio_step_to_reasoning": 1.6864940202294907, "adv/std_final_conf": 0.7671118378639221, "adv/std_reasoning": 0.681749701499939, "adv/std_step_conf": 0.9289894700050354, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.8942547315956085, "calib/avg_num_step_conf": 6.14453125, "calib/ece": 0.1483870967741935, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6330645161290323, "calib/gap": 0.5755102040816326, "calib/mean_conf": 0.7011290322580644, "calib/mu_c": 0.9355102040816325, "calib/mu_w": 0.36, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12838709677419352, "calib/std_conf": 0.41539664231605267, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5407525510204082, "calib/step_q_c_n": 784.0, "calib/step_q_gap": 0.2292443127441091, "calib/step_q_w": 0.3115082382762991, "calib/step_q_w_n": 789.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2963.0, "completions/max_terminated_length": 2963.0, "completions/mean_length": 560.36328125, "completions/mean_terminated_length": 564.7755737304688, "completions/min_length": 0.0, "completions/min_terminated_length": 86.0, "epoch": 0.15893333333333334, "grad_norm": 0.04593983665108681, "kl": 0.0635223388671875, "learning_rate": 1.4166666666666667e-06, "loss": 0.0619, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03306441754102707, "mask/share_reasoning": 0.83942711353302, "mask/share_step_conf": 0.11969595402479172, "num_tokens": 35070245.0, "reward": 1.0043818950653076, "reward_std": 0.1608709990978241, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.8255242109298706, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8746458292007446, "step": 149 }, { "adv/mean_abs_final_conf": 0.5928109884262085, "adv/mean_abs_reasoning": 0.5092207193374634, "adv/mean_abs_step_conf": 0.7641928791999817, "adv/ratio_final_to_reasoning": 1.1641533148877028, "adv/ratio_step_to_reasoning": 1.50071049778622, "adv/std_final_conf": 0.843606173992157, "adv/std_reasoning": 0.7753878235816956, "adv/std_step_conf": 0.9338046908378601, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.8056117290192113, "calib/avg_num_step_conf": 5.6171875, "calib/ece": 0.2101417004048584, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6761133603238867, "calib/gap": 0.3927350859453996, "calib/mean_conf": 0.7373886639676113, "calib/mu_c": 0.8741304347826088, "calib/mu_w": 0.4813953488372092, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14785425101214586, "calib/std_conf": 0.40214979327082795, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5194275240384616, "calib/step_q_c_n": 832.0, "calib/step_q_gap": 0.1665397352595837, "calib/step_q_w": 0.3528877887788779, "calib/step_q_w_n": 606.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3023.0, "completions/max_terminated_length": 3023.0, "completions/mean_length": 512.07421875, "completions/mean_terminated_length": 512.07421875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.16, "grad_norm": 0.06127196177840233, "kl": 0.076141357421875, "learning_rate": 1.3888888888888892e-06, "loss": 0.0391, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03732970356941223, "mask/share_reasoning": 0.832332968711853, "mask/share_step_conf": 0.13033737242221832, "num_tokens": 35306296.0, "reward": 0.9579633474349976, "reward_std": 0.18849684298038483, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7547647356987, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.842411994934082, "step": 150 }, { "adv/mean_abs_final_conf": 0.655048668384552, "adv/mean_abs_reasoning": 0.46310627460479736, "adv/mean_abs_step_conf": 0.7712806463241577, "adv/ratio_final_to_reasoning": 1.4144672709164936, "adv/ratio_step_to_reasoning": 1.6654506505711852, "adv/std_final_conf": 0.8607724905014038, "adv/std_reasoning": 0.7207038402557373, "adv/std_step_conf": 0.9343538284301758, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7660818713450293, "calib/avg_num_step_conf": 5.66796875, "calib/ece": 0.23698795180722892, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.570281124497992, "calib/gap": 0.40570565302144246, "calib/mean_conf": 0.6256626506024096, "calib/mu_c": 0.8114074074074074, "calib/mu_w": 0.4057017543859649, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1602409638554217, "calib/std_conf": 0.44182951000839615, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4922701949860725, "calib/step_q_c_n": 718.0, "calib/step_q_gap": 0.15772722090694558, "calib/step_q_w": 0.3345429740791269, "calib/step_q_w_n": 733.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2932.0, "completions/max_terminated_length": 2932.0, "completions/mean_length": 553.15234375, "completions/mean_terminated_length": 557.5078735351562, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.16106666666666666, "grad_norm": 0.026984497904777527, "kl": 0.058818817138671875, "learning_rate": 1.3611111111111112e-06, "loss": -0.0642, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.032044894993305206, "mask/share_reasoning": 0.8480304479598999, "mask/share_step_conf": 0.11211220920085907, "num_tokens": 35554927.0, "reward": 0.9289988875389099, "reward_std": 0.19891154766082764, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7304683923721313, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8267481327056885, "step": 151 }, { "adv/mean_abs_final_conf": 0.6867839097976685, "adv/mean_abs_reasoning": 0.5296306610107422, "adv/mean_abs_step_conf": 0.7530745267868042, "adv/ratio_final_to_reasoning": 1.2967223394639134, "adv/ratio_step_to_reasoning": 1.4218861977319135, "adv/std_final_conf": 0.8762410283088684, "adv/std_reasoning": 0.7928329706192017, "adv/std_step_conf": 0.9337801337242126, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7715010683760684, "calib/avg_num_step_conf": 6.03125, "calib/ece": 0.24040816326530617, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.563265306122449, "calib/gap": 0.3778725961538462, "calib/mean_conf": 0.651265306122449, "calib/mu_c": 0.8317187500000001, "calib/mu_w": 0.45384615384615384, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.18461224489795924, "calib/std_conf": 0.4215293233323014, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.49577348066298343, "calib/step_q_c_n": 724.0, "calib/step_q_gap": 0.13735884651664193, "calib/step_q_w": 0.3584146341463415, "calib/step_q_w_n": 820.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2585.0, "completions/max_terminated_length": 2585.0, "completions/mean_length": 561.15234375, "completions/mean_terminated_length": 565.5708618164062, "completions/min_length": 0.0, "completions/min_terminated_length": 159.0, "epoch": 0.16213333333333332, "grad_norm": 0.049046602100133896, "kl": 0.06014251708984375, "learning_rate": 1.3333333333333334e-06, "loss": -0.0531, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03138267248868942, "mask/share_reasoning": 0.8373855948448181, "mask/share_step_conf": 0.12341928482055664, "num_tokens": 35803974.0, "reward": 0.9200597405433655, "reward_std": 0.22718459367752075, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.7127765417098999, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8359366655349731, "step": 152 }, { "adv/mean_abs_final_conf": 0.6593279838562012, "adv/mean_abs_reasoning": 0.46595901250839233, "adv/mean_abs_step_conf": 0.7748792767524719, "adv/ratio_final_to_reasoning": 1.4149913751144068, "adv/ratio_step_to_reasoning": 1.6629773348112151, "adv/std_final_conf": 0.8450661301612854, "adv/std_reasoning": 0.7206805944442749, "adv/std_step_conf": 0.9337031245231628, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.6682795698924732, "calib/avg_num_step_conf": 6.5625, "calib/ece": 0.2902057613168725, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6008230452674898, "calib/gap": 0.26399354838709677, "calib/mean_conf": 0.6720987654320988, "calib/mu_c": 0.7731333333333333, "calib/mu_w": 0.5091397849462366, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1725102880658437, "calib/std_conf": 0.42473341738330084, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5326021798365123, "calib/step_q_c_n": 734.0, "calib/step_q_gap": 0.25974805721494776, "calib/step_q_w": 0.2728541226215645, "calib/step_q_w_n": 946.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2562.0, "completions/max_terminated_length": 2562.0, "completions/mean_length": 503.84765625, "completions/mean_terminated_length": 526.4693603515625, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.1632, "grad_norm": 0.0726659968495369, "kl": 0.0702667236328125, "learning_rate": 1.3055555555555556e-06, "loss": -0.1991, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.030434779822826385, "mask/share_reasoning": 0.8228424787521362, "mask/share_step_conf": 0.10375404357910156, "num_tokens": 36040279.0, "reward": 0.8930153846740723, "reward_std": 0.1965997815132141, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6692812442779541, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8097181916236877, "step": 153 }, { "adv/mean_abs_final_conf": 0.620254397392273, "adv/mean_abs_reasoning": 0.4352233409881592, "adv/mean_abs_step_conf": 0.7530338764190674, "adv/ratio_final_to_reasoning": 1.4251404715197658, "adv/ratio_step_to_reasoning": 1.730224014891597, "adv/std_final_conf": 0.8530692458152771, "adv/std_reasoning": 0.7013433575630188, "adv/std_step_conf": 0.93345046043396, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7079229253878355, "calib/avg_num_step_conf": 5.10546875, "calib/ece": 0.28772549019607846, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.6274509803921569, "calib/gap": 0.3493671509480424, "calib/mean_conf": 0.685607843137255, "calib/mu_c": 0.8554961832061069, "calib/mu_w": 0.5061290322580645, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2298039215686275, "calib/std_conf": 0.4278366959131226, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5391284403669725, "calib/step_q_c_n": 654.0, "calib/step_q_gap": 0.15605033929499695, "calib/step_q_w": 0.38307810107197554, "calib/step_q_w_n": 653.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2117.0, "completions/max_terminated_length": 2117.0, "completions/mean_length": 478.34765625, "completions/mean_terminated_length": 478.34765625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.16426666666666667, "grad_norm": 0.043535955250263214, "kl": 0.071868896484375, "learning_rate": 1.2777777777777779e-06, "loss": 0.0157, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03501860797405243, "mask/share_reasoning": 0.8479246497154236, "mask/share_step_conf": 0.1170567199587822, "num_tokens": 36267176.0, "reward": 0.9365236163139343, "reward_std": 0.187381774187088, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7093707323074341, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8621140122413635, "step": 154 }, { "adv/mean_abs_final_conf": 0.675777792930603, "adv/mean_abs_reasoning": 0.4805974066257477, "adv/mean_abs_step_conf": 0.7654271125793457, "adv/ratio_final_to_reasoning": 1.406120348578674, "adv/ratio_step_to_reasoning": 1.5926576007835214, "adv/std_final_conf": 0.857191264629364, "adv/std_reasoning": 0.7392981648445129, "adv/std_step_conf": 0.9342840313911438, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7550637659414854, "calib/avg_num_step_conf": 5.55859375, "calib/ece": 0.2551383399209485, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5652173913043478, "calib/gap": 0.4017823205801449, "calib/mean_conf": 0.6168774703557313, "calib/mu_c": 0.813798449612403, "calib/mu_w": 0.4120161290322581, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.18106719367588922, "calib/std_conf": 0.4491829789994829, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4879407407407408, "calib/step_q_c_n": 675.0, "calib/step_q_gap": 0.15923753218459102, "calib/step_q_w": 0.32870320855614976, "calib/step_q_w_n": 748.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2004.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 485.56640625, "completions/mean_terminated_length": 487.4706115722656, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.16533333333333333, "grad_norm": 0.048178721219301224, "kl": 0.0730438232421875, "learning_rate": 1.25e-06, "loss": 0.0136, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0337291955947876, "mask/share_reasoning": 0.8391321897506714, "mask/share_step_conf": 0.12323231995105743, "num_tokens": 36498697.0, "reward": 0.9229997396469116, "reward_std": 0.18754538893699646, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.729051947593689, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.818510115146637, "step": 155 }, { "adv/mean_abs_final_conf": 0.6232823133468628, "adv/mean_abs_reasoning": 0.49731171131134033, "adv/mean_abs_step_conf": 0.7660905718803406, "adv/ratio_final_to_reasoning": 1.2533031078302095, "adv/ratio_step_to_reasoning": 1.5404635653165468, "adv/std_final_conf": 0.8286430835723877, "adv/std_reasoning": 0.7393476366996765, "adv/std_step_conf": 0.9338200688362122, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7351076476026944, "calib/avg_num_step_conf": 6.0625, "calib/ece": 0.2925101214574899, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6720647773279352, "calib/gap": 0.29858605204068167, "calib/mean_conf": 0.7248178137651821, "calib/mu_c": 0.8614179104477613, "calib/mu_w": 0.5628318584070796, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2374089068825911, "calib/std_conf": 0.4067893784099128, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.511150442477876, "calib/step_q_c_n": 791.0, "calib/step_q_gap": 0.13062481829916378, "calib/step_q_w": 0.3805256241787122, "calib/step_q_w_n": 761.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2771.0, "completions/max_terminated_length": 2771.0, "completions/mean_length": 555.74609375, "completions/mean_terminated_length": 560.1220703125, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.1664, "grad_norm": 0.037357404828071594, "kl": 0.06532669067382812, "learning_rate": 1.2222222222222223e-06, "loss": 0.0328, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03344070166349411, "mask/share_reasoning": 0.8330366611480713, "mask/share_step_conf": 0.1257101595401764, "num_tokens": 36745728.0, "reward": 0.8930503129959106, "reward_std": 0.21639417111873627, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6766519546508789, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8117923736572266, "step": 156 }, { "adv/mean_abs_final_conf": 0.5315195322036743, "adv/mean_abs_reasoning": 0.46138864755630493, "adv/mean_abs_step_conf": 0.7667800188064575, "adv/ratio_final_to_reasoning": 1.1519995886739087, "adv/ratio_step_to_reasoning": 1.6618961538555943, "adv/std_final_conf": 0.7801711559295654, "adv/std_reasoning": 0.7206206321716309, "adv/std_step_conf": 0.9337723851203918, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.8493701055498808, "calib/avg_num_step_conf": 6.1328125, "calib/ece": 0.15578740157480314, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6456692913385826, "calib/gap": 0.543925774599932, "calib/mean_conf": 0.6954724409448819, "calib/mu_c": 0.8860606060606062, "calib/mu_w": 0.3421348314606742, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.10082677165354331, "calib/std_conf": 0.4248607613834598, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5660838445807772, "calib/step_q_c_n": 978.0, "calib/step_q_gap": 0.22544195268888534, "calib/step_q_w": 0.34064189189189187, "calib/step_q_w_n": 592.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2705.0, "completions/max_terminated_length": 2705.0, "completions/mean_length": 521.609375, "completions/mean_terminated_length": 523.6549072265625, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.16746666666666668, "grad_norm": 0.034724507480859756, "kl": 0.068572998046875, "learning_rate": 1.1944444444444446e-06, "loss": 0.0043, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.033151306211948395, "mask/share_reasoning": 0.8360445499420166, "mask/share_step_conf": 0.1268979161977768, "num_tokens": 36982988.0, "reward": 1.0144009590148926, "reward_std": 0.16522559523582458, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.8308441638946533, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8706140518188477, "step": 157 }, { "adv/mean_abs_final_conf": 0.5890312194824219, "adv/mean_abs_reasoning": 0.42107200622558594, "adv/mean_abs_step_conf": 0.7548243403434753, "adv/ratio_final_to_reasoning": 1.3988847768874313, "adv/ratio_step_to_reasoning": 1.7926253210456462, "adv/std_final_conf": 0.7838677763938904, "adv/std_reasoning": 0.6816832423210144, "adv/std_step_conf": 0.9335840344429016, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6977124183006537, "calib/avg_num_step_conf": 5.88671875, "calib/ece": 0.2811904761904762, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7698412698412699, "calib/gap": 0.25526274509803903, "calib/mean_conf": 0.8084126984126985, "calib/mu_c": 0.9117333333333333, "calib/mu_w": 0.6564705882352943, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2471825396825397, "calib/std_conf": 0.3575321335284662, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.564733581164808, "calib/step_q_c_n": 807.0, "calib/step_q_gap": 0.16187643830766513, "calib/step_q_w": 0.40285714285714286, "calib/step_q_w_n": 700.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2211.0, "completions/max_terminated_length": 2211.0, "completions/mean_length": 488.40625, "completions/mean_terminated_length": 494.1976623535156, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.16853333333333334, "grad_norm": 0.030512472614645958, "kl": 0.068450927734375, "learning_rate": 1.1666666666666668e-06, "loss": -0.0752, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03567176312208176, "mask/share_reasoning": 0.8231402039527893, "mask/share_step_conf": 0.12946924567222595, "num_tokens": 37213260.0, "reward": 0.9132636189460754, "reward_std": 0.1889183521270752, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6938175559043884, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8194283843040466, "step": 158 }, { "adv/mean_abs_final_conf": 0.5502872467041016, "adv/mean_abs_reasoning": 0.4450033903121948, "adv/mean_abs_step_conf": 0.7452594637870789, "adv/ratio_final_to_reasoning": 1.2365911332002306, "adv/ratio_step_to_reasoning": 1.6747276088486371, "adv/std_final_conf": 0.8012663125991821, "adv/std_reasoning": 0.7205966711044312, "adv/std_step_conf": 0.9334879517555237, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.74140522875817, "calib/avg_num_step_conf": 5.42578125, "calib/ece": 0.20841269841269838, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/gap": 0.4341176470588235, "calib/mean_conf": 0.705952380952381, "calib/mu_c": 0.8816666666666667, "calib/mu_w": 0.4475490196078432, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15956349206349202, "calib/std_conf": 0.4222185896132073, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.555981308411215, "calib/step_q_c_n": 749.0, "calib/step_q_gap": 0.19538755841121497, "calib/step_q_w": 0.36059375, "calib/step_q_w_n": 640.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2803.0, "completions/max_terminated_length": 2803.0, "completions/mean_length": 498.8515625, "completions/mean_terminated_length": 500.807861328125, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.1696, "grad_norm": 0.040636204183101654, "kl": 0.071807861328125, "learning_rate": 1.138888888888889e-06, "loss": 0.0685, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03515962138772011, "mask/share_reasoning": 0.8426576852798462, "mask/share_step_conf": 0.11827646195888519, "num_tokens": 37445750.0, "reward": 0.9766671061515808, "reward_std": 0.16175855696201324, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7655757665634155, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8736958503723145, "step": 159 }, { "adv/mean_abs_final_conf": 0.6365013122558594, "adv/mean_abs_reasoning": 0.4862366318702698, "adv/mean_abs_step_conf": 0.7508884072303772, "adv/ratio_final_to_reasoning": 1.3090361164431579, "adv/ratio_step_to_reasoning": 1.5442859669830835, "adv/std_final_conf": 0.8413640856742859, "adv/std_reasoning": 0.7576702237129211, "adv/std_step_conf": 0.9344907402992249, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7269791666666666, "calib/avg_num_step_conf": 5.83203125, "calib/ece": 0.2636585365853658, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6097560975609756, "calib/gap": 0.34445416666666673, "calib/mean_conf": 0.6595121951219513, "calib/mu_c": 0.7939333333333334, "calib/mu_w": 0.44947916666666665, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15670731707317068, "calib/std_conf": 0.44032388976758857, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5100624219725343, "calib/step_q_c_n": 801.0, "calib/step_q_gap": 0.17267802890895045, "calib/step_q_w": 0.3373843930635838, "calib/step_q_w_n": 692.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2923.0, "completions/max_terminated_length": 2923.0, "completions/mean_length": 522.58984375, "completions/mean_terminated_length": 530.8849487304688, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.17066666666666666, "grad_norm": 0.04386411979794502, "kl": 0.06703948974609375, "learning_rate": 1.111111111111111e-06, "loss": -0.0188, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03261594846844673, "mask/share_reasoning": 0.8347195386886597, "mask/share_step_conf": 0.11703953146934509, "num_tokens": 37684373.0, "reward": 0.9110732674598694, "reward_std": 0.23112061619758606, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7011132836341858, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8116582036018372, "step": 160 }, { "adv/mean_abs_final_conf": 0.5453575253486633, "adv/mean_abs_reasoning": 0.4391787052154541, "adv/mean_abs_step_conf": 0.7485448718070984, "adv/ratio_final_to_reasoning": 1.2417667771052778, "adv/ratio_step_to_reasoning": 1.7044197792784015, "adv/std_final_conf": 0.7737199664115906, "adv/std_reasoning": 0.7206230759620667, "adv/std_step_conf": 0.9342144131660461, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7818750954344175, "calib/avg_num_step_conf": 5.546875, "calib/ece": 0.18187250996015936, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6573705179282868, "calib/gap": 0.44304168575355013, "calib/mean_conf": 0.7197211155378486, "calib/mu_c": 0.8503389830508474, "calib/mu_w": 0.4072972972972973, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.09820717131474105, "calib/std_conf": 0.4102000312649325, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5249844559585491, "calib/step_q_c_n": 965.0, "calib/step_q_gap": 0.14935808233217557, "calib/step_q_w": 0.3756263736263736, "calib/step_q_w_n": 455.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1989.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 472.8984375, "completions/mean_terminated_length": 480.40478515625, "completions/min_length": 0.0, "completions/min_terminated_length": 103.0, "epoch": 0.17173333333333332, "grad_norm": 0.043638553470373154, "kl": 0.06868743896484375, "learning_rate": 1.0833333333333335e-06, "loss": -0.0168, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03504636883735657, "mask/share_reasoning": 0.8246728777885437, "mask/share_step_conf": 0.12465573847293854, "num_tokens": 37909355.0, "reward": 0.9816228747367859, "reward_std": 0.18595723807811737, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.7881566286087036, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8414953947067261, "step": 161 }, { "adv/mean_abs_final_conf": 0.5180578827857971, "adv/mean_abs_reasoning": 0.31734955310821533, "adv/mean_abs_step_conf": 0.7810468077659607, "adv/ratio_final_to_reasoning": 1.6324519058299753, "adv/ratio_step_to_reasoning": 2.461156160820639, "adv/std_final_conf": 0.762174665927887, "adv/std_reasoning": 0.5960145592689514, "adv/std_step_conf": 0.9328275322914124, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7803224056420988, "calib/avg_num_step_conf": 5.2265625, "calib/ece": 0.15679687500000003, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.73046875, "calib/gap": 0.45043788266294665, "calib/mean_conf": 0.780625, "calib/mu_c": 0.9020320855614974, "calib/mu_w": 0.45159420289855073, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10347656250000004, "calib/std_conf": 0.37814772658975493, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5128571428571428, "calib/step_q_c_n": 959.0, "calib/step_q_gap": 0.0898228420655861, "calib/step_q_w": 0.4230343007915567, "calib/step_q_w_n": 379.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 899.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 433.8125, "completions/mean_terminated_length": 435.5137634277344, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.1728, "grad_norm": 0.04205428436398506, "kl": 0.09763336181640625, "learning_rate": 1.0555555555555557e-06, "loss": 0.102, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03682439774274826, "mask/share_reasoning": 0.8325048685073853, "mask/share_step_conf": 0.1267644464969635, "num_tokens": 38124555.0, "reward": 1.028838038444519, "reward_std": 0.14918027818202972, "rewards/accuracy_reward_step": 0.73046875, "rewards/final_brier_reward_step": 0.834972620010376, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8766096830368042, "step": 162 }, { "adv/mean_abs_final_conf": 0.5653954148292542, "adv/mean_abs_reasoning": 0.4558749794960022, "adv/mean_abs_step_conf": 0.7330037355422974, "adv/ratio_final_to_reasoning": 1.2402422599598109, "adv/ratio_step_to_reasoning": 1.607905168107006, "adv/std_final_conf": 0.8036025762557983, "adv/std_reasoning": 0.7207197546958923, "adv/std_step_conf": 0.9333640336990356, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7831508114526982, "calib/avg_num_step_conf": 6.44140625, "calib/ece": 0.19638554216867476, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5783132530120482, "calib/gap": 0.4668135637946959, "calib/mean_conf": 0.6514859437751004, "calib/mu_c": 0.8502097902097903, "calib/mu_w": 0.38339622641509435, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1367871485943776, "calib/std_conf": 0.43866148268199695, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5261176470588236, "calib/step_q_c_n": 765.0, "calib/step_q_gap": 0.21495248868778288, "calib/step_q_w": 0.3111651583710407, "calib/step_q_w_n": 884.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2422.0, "completions/max_terminated_length": 2422.0, "completions/mean_length": 523.34375, "completions/mean_terminated_length": 531.6508178710938, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.17386666666666667, "grad_norm": 0.04157635197043419, "kl": 0.07187652587890625, "learning_rate": 1.0277777777777777e-06, "loss": -0.0034, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03407448157668114, "mask/share_reasoning": 0.8247135877609253, "mask/share_step_conf": 0.12558691203594208, "num_tokens": 38363363.0, "reward": 0.9599740505218506, "reward_std": 0.19809526205062866, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7604609131813049, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8540183305740356, "step": 163 }, { "adv/mean_abs_final_conf": 0.6849848031997681, "adv/mean_abs_reasoning": 0.5212238430976868, "adv/mean_abs_step_conf": 0.7627356648445129, "adv/ratio_final_to_reasoning": 1.3141854738049454, "adv/ratio_step_to_reasoning": 1.463355283809537, "adv/std_final_conf": 0.8790825605392456, "adv/std_reasoning": 0.7753711342811584, "adv/std_step_conf": 0.9340617656707764, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7601245085190039, "calib/avg_num_step_conf": 6.25, "calib/ece": 0.2584337349397591, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5742971887550201, "calib/gap": 0.3704062909567496, "calib/mean_conf": 0.6364257028112449, "calib/mu_c": 0.7985714285714286, "calib/mu_w": 0.428165137614679, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16630522088353417, "calib/std_conf": 0.44016480518654333, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5005243902439025, "calib/step_q_c_n": 820.0, "calib/step_q_gap": 0.1758192620387743, "calib/step_q_w": 0.3247051282051282, "calib/step_q_w_n": 780.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2644.0, "completions/max_terminated_length": 2644.0, "completions/mean_length": 560.58203125, "completions/mean_terminated_length": 569.4801635742188, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.17493333333333333, "grad_norm": 0.04676659405231476, "kl": 0.0653533935546875, "learning_rate": 1.0000000000000002e-06, "loss": -0.0637, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.02996118739247322, "mask/share_reasoning": 0.8411446809768677, "mask/share_step_conf": 0.11326909065246582, "num_tokens": 38613008.0, "reward": 0.9299562573432922, "reward_std": 0.20216163992881775, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7168090343475342, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8391972184181213, "step": 164 }, { "adv/mean_abs_final_conf": 0.4965021014213562, "adv/mean_abs_reasoning": 0.3882831335067749, "adv/mean_abs_step_conf": 0.7505602836608887, "adv/ratio_final_to_reasoning": 1.2787114828738577, "adv/ratio_step_to_reasoning": 1.9330231444312598, "adv/std_final_conf": 0.7457696795463562, "adv/std_reasoning": 0.7012252807617188, "adv/std_step_conf": 0.9331449866294861, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7731105462210924, "calib/avg_num_step_conf": 5.2734375, "calib/ece": 0.24417322834645674, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5905511811023622, "calib/gap": 0.44362204724409443, "calib/mean_conf": 0.6433858267716536, "calib/mu_c": 0.8651968503937008, "calib/mu_w": 0.4215748031496063, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.19377952755905517, "calib/std_conf": 0.44615897112537706, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5396116504854369, "calib/step_q_c_n": 618.0, "calib/step_q_gap": 0.17704334447450792, "calib/step_q_w": 0.362568306010929, "calib/step_q_w_n": 732.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2283.0, "completions/max_terminated_length": 2283.0, "completions/mean_length": 518.08203125, "completions/mean_terminated_length": 518.08203125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.176, "grad_norm": 0.06405764073133469, "kl": 0.07071685791015625, "learning_rate": 9.722222222222224e-07, "loss": 0.0184, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03335200995206833, "mask/share_reasoning": 0.8532591462135315, "mask/share_step_conf": 0.11338884383440018, "num_tokens": 38851213.0, "reward": 0.9515175819396973, "reward_std": 0.15023109316825867, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.7463171482086182, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8590617179870605, "step": 165 }, { "adv/mean_abs_final_conf": 0.490718275308609, "adv/mean_abs_reasoning": 0.3937041163444519, "adv/mean_abs_step_conf": 0.7531986832618713, "adv/ratio_final_to_reasoning": 1.2464138802127214, "adv/ratio_step_to_reasoning": 1.9131084791678872, "adv/std_final_conf": 0.7293896675109863, "adv/std_reasoning": 0.6613433361053467, "adv/std_step_conf": 0.9326786398887634, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8938230994152047, "calib/avg_num_step_conf": 5.84765625, "calib/ece": 0.12338645418326699, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6135458167330677, "calib/gap": 0.6554312865497076, "calib/mean_conf": 0.6602788844621513, "calib/mu_c": 0.8691812865497076, "calib/mu_w": 0.21375000000000002, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.051195219123506046, "calib/std_conf": 0.444739417316579, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5190594594594594, "calib/step_q_c_n": 925.0, "calib/step_q_gap": 0.21079022869022868, "calib/step_q_w": 0.30826923076923074, "calib/step_q_w_n": 572.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2650.0, "completions/max_terminated_length": 2650.0, "completions/mean_length": 535.546875, "completions/mean_terminated_length": 537.6470947265625, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.17706666666666668, "grad_norm": 0.04973738268017769, "kl": 0.16271209716796875, "learning_rate": 9.444444444444445e-07, "loss": -0.0363, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03301393240690231, "mask/share_reasoning": 0.8391966223716736, "mask/share_step_conf": 0.1238832175731659, "num_tokens": 39094497.0, "reward": 1.0374794006347656, "reward_std": 0.14901401102542877, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.8522887229919434, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8929827213287354, "step": 166 }, { "adv/mean_abs_final_conf": 0.5144798755645752, "adv/mean_abs_reasoning": 0.4528118073940277, "adv/mean_abs_step_conf": 0.7564898729324341, "adv/ratio_final_to_reasoning": 1.1361891787350968, "adv/ratio_step_to_reasoning": 1.670649617743187, "adv/std_final_conf": 0.7716760039329529, "adv/std_reasoning": 0.7205672860145569, "adv/std_step_conf": 0.93340003490448, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6497036657642496, "calib/avg_num_step_conf": 5.40234375, "calib/ece": 0.228888888888889, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.75, "calib/gap": 0.3497182995536694, "calib/mean_conf": 0.7766666666666666, "calib/mu_c": 0.886300578034682, "calib/mu_w": 0.5365822784810126, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15952380952380965, "calib/std_conf": 0.392903316189894, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.544224318658281, "calib/step_q_c_n": 954.0, "calib/step_q_gap": 0.0957627801967425, "calib/step_q_w": 0.4484615384615385, "calib/step_q_w_n": 429.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2540.0, "completions/max_terminated_length": 2540.0, "completions/mean_length": 493.9296875, "completions/mean_terminated_length": 495.86669921875, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.17813333333333334, "grad_norm": 0.05870332941412926, "kl": 0.0638885498046875, "learning_rate": 9.166666666666666e-07, "loss": 0.0712, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03407091647386551, "mask/share_reasoning": 0.8457885980606079, "mask/share_step_conf": 0.1162342056632042, "num_tokens": 39326551.0, "reward": 0.9671033024787903, "reward_std": 0.17421361804008484, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7607374787330627, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8414378762245178, "step": 167 }, { "adv/mean_abs_final_conf": 0.5943257808685303, "adv/mean_abs_reasoning": 0.4640789330005646, "adv/mean_abs_step_conf": 0.7342292070388794, "adv/ratio_final_to_reasoning": 1.2806566698165702, "adv/ratio_step_to_reasoning": 1.5821213910565213, "adv/std_final_conf": 0.8195285797119141, "adv/std_reasoning": 0.701535701751709, "adv/std_step_conf": 0.9333325624465942, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7641162029459903, "calib/avg_num_step_conf": 5.92578125, "calib/ece": 0.2363199999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.616, "calib/gap": 0.41958537915984717, "calib/mean_conf": 0.6648, "calib/mu_c": 0.8225641025641025, "calib/mu_w": 0.40297872340425533, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1385599999999999, "calib/std_conf": 0.43775902046674037, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5240274314214464, "calib/step_q_c_n": 802.0, "calib/step_q_gap": 0.20019526358927853, "calib/step_q_w": 0.32383216783216784, "calib/step_q_w_n": 715.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2319.0, "completions/max_terminated_length": 2319.0, "completions/mean_length": 535.79296875, "completions/mean_terminated_length": 546.4661254882812, "completions/min_length": 0.0, "completions/min_terminated_length": 123.0, "epoch": 0.1792, "grad_norm": 0.02657085470855236, "kl": 0.06554412841796875, "learning_rate": 8.88888888888889e-07, "loss": -0.1271, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03261389583349228, "mask/share_reasoning": 0.8362942934036255, "mask/share_step_conf": 0.11156059056520462, "num_tokens": 39568386.0, "reward": 0.9644992351531982, "reward_std": 0.18663950264453888, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7509452700614929, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8608657121658325, "step": 168 }, { "adv/mean_abs_final_conf": 0.5595456957817078, "adv/mean_abs_reasoning": 0.4441044330596924, "adv/mean_abs_step_conf": 0.7435603141784668, "adv/ratio_final_to_reasoning": 1.2599417031860587, "adv/ratio_step_to_reasoning": 1.6742915828505687, "adv/std_final_conf": 0.7765763401985168, "adv/std_reasoning": 0.7014807462692261, "adv/std_step_conf": 0.9342918395996094, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7951638689048761, "calib/avg_num_step_conf": 5.19140625, "calib/ece": 0.20862348178137655, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5951417004048583, "calib/gap": 0.4529303224087397, "calib/mean_conf": 0.6636842105263158, "calib/mu_c": 0.861726618705036, "calib/mu_w": 0.40879629629629627, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1547773279352227, "calib/std_conf": 0.4324253574325776, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5857077625570777, "calib/step_q_c_n": 657.0, "calib/step_q_gap": 0.2105887149380301, "calib/step_q_w": 0.3751190476190476, "calib/step_q_w_n": 672.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2674.0, "completions/max_terminated_length": 2674.0, "completions/mean_length": 504.5625, "completions/mean_terminated_length": 510.54547119140625, "completions/min_length": 0.0, "completions/min_terminated_length": 182.0, "epoch": 0.18026666666666666, "grad_norm": 0.04750969633460045, "kl": 0.06903839111328125, "learning_rate": 8.611111111111112e-07, "loss": -0.0662, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03275205194950104, "mask/share_reasoning": 0.845567524433136, "mask/share_step_conf": 0.10996170341968536, "num_tokens": 39801738.0, "reward": 0.9409176707267761, "reward_std": 0.2118673324584961, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.7484175562858582, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8326364755630493, "step": 169 }, { "adv/mean_abs_final_conf": 0.5320782661437988, "adv/mean_abs_reasoning": 0.38035115599632263, "adv/mean_abs_step_conf": 0.7660174369812012, "adv/ratio_final_to_reasoning": 1.3989132351919107, "adv/ratio_step_to_reasoning": 2.0139742574848576, "adv/std_final_conf": 0.778958797454834, "adv/std_reasoning": 0.6612679958343506, "adv/std_step_conf": 0.9337641000747681, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7838235294117648, "calib/avg_num_step_conf": 5.72265625, "calib/ece": 0.20632411067193668, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7075098814229249, "calib/gap": 0.46325751633986934, "calib/mean_conf": 0.7273517786561265, "calib/mu_c": 0.9104575163398694, "calib/mu_w": 0.44720000000000004, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16446640316205524, "calib/std_conf": 0.42422831914620107, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5701946472019465, "calib/step_q_c_n": 822.0, "calib/step_q_gap": 0.21999246990801175, "calib/step_q_w": 0.3502021772939347, "calib/step_q_w_n": 643.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2636.0, "completions/max_terminated_length": 2636.0, "completions/mean_length": 525.0, "completions/mean_terminated_length": 527.058837890625, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.18133333333333335, "grad_norm": 0.03782640025019646, "kl": 0.0851593017578125, "learning_rate": 8.333333333333333e-07, "loss": 0.0339, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.032207902520895004, "mask/share_reasoning": 0.8428486585617065, "mask/share_step_conf": 0.12103715538978577, "num_tokens": 40040290.0, "reward": 0.9744622707366943, "reward_std": 0.19172057509422302, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7742984294891357, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8582197427749634, "step": 170 }, { "adv/mean_abs_final_conf": 0.5948938131332397, "adv/mean_abs_reasoning": 0.3816094398498535, "adv/mean_abs_step_conf": 0.7755590677261353, "adv/ratio_final_to_reasoning": 1.5589074876326545, "adv/ratio_step_to_reasoning": 2.032337218993544, "adv/std_final_conf": 0.8434175252914429, "adv/std_reasoning": 0.6814701557159424, "adv/std_step_conf": 0.9340186715126038, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7388299266247379, "calib/avg_num_step_conf": 5.96875, "calib/ece": 0.27612000000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.552, "calib/gap": 0.3564544025157233, "calib/mean_conf": 0.62428, "calib/mu_c": 0.7754166666666666, "calib/mu_w": 0.41896226415094334, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16219999999999998, "calib/std_conf": 0.4515622676885216, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5402421307506053, "calib/step_q_c_n": 826.0, "calib/step_q_gap": 0.20239312790160247, "calib/step_q_w": 0.3378490028490029, "calib/step_q_w_n": 702.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2935.0, "completions/max_terminated_length": 2935.0, "completions/mean_length": 513.85546875, "completions/mean_terminated_length": 519.9486083984375, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.1824, "grad_norm": 0.05200982093811035, "kl": 0.06806182861328125, "learning_rate": 8.055555555555557e-07, "loss": -0.0064, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.032680168747901917, "mask/share_reasoning": 0.8358561396598816, "mask/share_step_conf": 0.11974497139453888, "num_tokens": 40278733.0, "reward": 0.9288033843040466, "reward_std": 0.17771439254283905, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7066855430603027, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8431086540222168, "step": 171 }, { "adv/mean_abs_final_conf": 0.5560131669044495, "adv/mean_abs_reasoning": 0.39456892013549805, "adv/mean_abs_step_conf": 0.7697474956512451, "adv/ratio_final_to_reasoning": 1.409166152046416, "adv/ratio_step_to_reasoning": 1.9508568880359554, "adv/std_final_conf": 0.7747467756271362, "adv/std_reasoning": 0.6403370499610901, "adv/std_step_conf": 0.9329255819320679, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7173851789236405, "calib/avg_num_step_conf": 5.43359375, "calib/ece": 0.2423320158102767, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7312252964426877, "calib/gap": 0.27048393913778535, "calib/mean_conf": 0.788893280632411, "calib/mu_c": 0.878698224852071, "calib/mu_w": 0.6082142857142857, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1816205533596838, "calib/std_conf": 0.3672539380512327, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5233668903803131, "calib/step_q_c_n": 894.0, "calib/step_q_gap": 0.10366870124550431, "calib/step_q_w": 0.41969818913480883, "calib/step_q_w_n": 497.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2925.0, "completions/max_terminated_length": 2925.0, "completions/mean_length": 493.23046875, "completions/mean_terminated_length": 495.16473388671875, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.18346666666666667, "grad_norm": 0.04100465774536133, "kl": 0.07511138916015625, "learning_rate": 7.777777777777779e-07, "loss": 0.008, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0354820191860199, "mask/share_reasoning": 0.8346220254898071, "mask/share_step_conf": 0.12598973512649536, "num_tokens": 40508352.0, "reward": 0.9622762203216553, "reward_std": 0.1651277393102646, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7399269342422485, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8549380302429199, "step": 172 }, { "adv/mean_abs_final_conf": 0.4922311305999756, "adv/mean_abs_reasoning": 0.4386613965034485, "adv/mean_abs_step_conf": 0.7494137287139893, "adv/ratio_final_to_reasoning": 1.1221209217941883, "adv/ratio_step_to_reasoning": 1.7084104840032301, "adv/std_final_conf": 0.7518383264541626, "adv/std_reasoning": 0.7205957174301147, "adv/std_step_conf": 0.9345707893371582, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6923623951073815, "calib/avg_num_step_conf": 5.69140625, "calib/ece": 0.24757085020242917, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7854251012145749, "calib/gap": 0.2964066277912103, "calib/mean_conf": 0.8197165991902833, "calib/mu_c": 0.9265189873417721, "calib/mu_w": 0.6301123595505618, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.21380566801619436, "calib/std_conf": 0.3551283064132805, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5629658213891952, "calib/step_q_c_n": 907.0, "calib/step_q_gap": 0.11854763957101333, "calib/step_q_w": 0.4444181818181818, "calib/step_q_w_n": 550.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2519.0, "completions/max_terminated_length": 2519.0, "completions/mean_length": 524.23046875, "completions/mean_terminated_length": 532.5516357421875, "completions/min_length": 0.0, "completions/min_terminated_length": 91.0, "epoch": 0.18453333333333333, "grad_norm": 0.027799900621175766, "kl": 0.06717681884765625, "learning_rate": 7.5e-07, "loss": -0.0282, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.034872934222221375, "mask/share_reasoning": 0.8314304947853088, "mask/share_step_conf": 0.11807158589363098, "num_tokens": 40745715.0, "reward": 0.9166386127471924, "reward_std": 0.18919737637043, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7176336050033569, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8000184297561646, "step": 173 }, { "adv/mean_abs_final_conf": 0.7102653980255127, "adv/mean_abs_reasoning": 0.6087853908538818, "adv/mean_abs_step_conf": 0.7541342973709106, "adv/ratio_final_to_reasoning": 1.1666925795136034, "adv/ratio_step_to_reasoning": 1.2387522905455444, "adv/std_final_conf": 0.8909445405006409, "adv/std_reasoning": 0.8267565369606018, "adv/std_step_conf": 0.9344921708106995, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6303801652892562, "calib/avg_num_step_conf": 5.8203125, "calib/ece": 0.3572357723577235, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.556910569105691, "calib/gap": 0.21883636363636372, "calib/mean_conf": 0.6324390243902438, "calib/mu_c": 0.7436363636363637, "calib/mu_w": 0.5247999999999999, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.24890243902439022, "calib/std_conf": 0.4382417595094173, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5201589595375722, "calib/step_q_c_n": 692.0, "calib/step_q_gap": 0.09376798209396309, "calib/step_q_w": 0.4263909774436091, "calib/step_q_w_n": 798.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2789.0, "completions/max_terminated_length": 2789.0, "completions/mean_length": 593.65234375, "completions/mean_terminated_length": 600.6917114257812, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.1856, "grad_norm": 0.04501314088702202, "kl": 0.059535980224609375, "learning_rate": 7.222222222222222e-07, "loss": -0.1192, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.029303016141057014, "mask/share_reasoning": 0.8484967350959778, "mask/share_step_conf": 0.11048145592212677, "num_tokens": 41001922.0, "reward": 0.8417038917541504, "reward_std": 0.24447256326675415, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.6223413944244385, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7743476629257202, "step": 174 }, { "adv/mean_abs_final_conf": 0.6777516603469849, "adv/mean_abs_reasoning": 0.5734829902648926, "adv/mean_abs_step_conf": 0.7648620009422302, "adv/ratio_final_to_reasoning": 1.1818164999696512, "adv/ratio_step_to_reasoning": 1.3337134909423196, "adv/std_final_conf": 0.867162823677063, "adv/std_reasoning": 0.7928904891014099, "adv/std_step_conf": 0.9347115159034729, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7657891281512604, "calib/avg_num_step_conf": 5.64453125, "calib/ece": 0.26375000000000004, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5120967741935484, "calib/gap": 0.41025210084033614, "calib/mean_conf": 0.5627016129032258, "calib/mu_c": 0.7876785714285715, "calib/mu_w": 0.3774264705882353, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1874193548387097, "calib/std_conf": 0.46484601126711506, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.540097244732577, "calib/step_q_c_n": 617.0, "calib/step_q_gap": 0.16794748627847078, "calib/step_q_w": 0.37214975845410625, "calib/step_q_w_n": 828.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2534.0, "completions/max_terminated_length": 2534.0, "completions/mean_length": 535.44921875, "completions/mean_terminated_length": 546.1155395507812, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.18666666666666668, "grad_norm": 0.0433058999478817, "kl": 0.068878173828125, "learning_rate": 6.944444444444446e-07, "loss": -0.173, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03162894770503044, "mask/share_reasoning": 0.8328102827072144, "mask/share_step_conf": 0.11602950841188431, "num_tokens": 41244821.0, "reward": 0.8938636183738708, "reward_std": 0.2151774913072586, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.7044011354446411, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8028572797775269, "step": 175 }, { "adv/mean_abs_final_conf": 0.6084058880805969, "adv/mean_abs_reasoning": 0.5545529127120972, "adv/mean_abs_step_conf": 0.7464092969894409, "adv/ratio_final_to_reasoning": 1.0971106167401166, "adv/ratio_step_to_reasoning": 1.3459658760767312, "adv/std_final_conf": 0.8169024586677551, "adv/std_reasoning": 0.7928866744041443, "adv/std_step_conf": 0.9332429766654968, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7914652162234757, "calib/avg_num_step_conf": 5.66015625, "calib/ece": 0.2469758064516129, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6532258064516129, "calib/gap": 0.42206956755304864, "calib/mean_conf": 0.6859274193548388, "calib/mu_c": 0.8595205479452055, "calib/mu_w": 0.43745098039215685, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.17209677419354838, "calib/std_conf": 0.43977810888518604, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.553955223880597, "calib/step_q_c_n": 804.0, "calib/step_q_gap": 0.18745909984958925, "calib/step_q_w": 0.3664961240310078, "calib/step_q_w_n": 645.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3014.0, "completions/max_terminated_length": 3014.0, "completions/mean_length": 563.66796875, "completions/mean_terminated_length": 563.66796875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.18773333333333334, "grad_norm": 0.04436732083559036, "kl": 0.06317901611328125, "learning_rate": 6.666666666666667e-07, "loss": 0.0278, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.035566989332437515, "mask/share_reasoning": 0.8393169045448303, "mask/share_step_conf": 0.12511610984802246, "num_tokens": 41493184.0, "reward": 0.9451636075973511, "reward_std": 0.21857215464115143, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7356737852096558, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8468407392501831, "step": 176 }, { "adv/mean_abs_final_conf": 0.5914150476455688, "adv/mean_abs_reasoning": 0.37636247277259827, "adv/mean_abs_step_conf": 0.7463341951370239, "adv/ratio_final_to_reasoning": 1.5713974969096012, "adv/ratio_step_to_reasoning": 1.9830196927948394, "adv/std_final_conf": 0.8054783940315247, "adv/std_reasoning": 0.6612831354141235, "adv/std_step_conf": 0.9332849383354187, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7522977346278318, "calib/avg_num_step_conf": 6.0234375, "calib/ece": 0.2451778656126482, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6047430830039525, "calib/gap": 0.40564854368932046, "calib/mean_conf": 0.6554545454545455, "calib/mu_c": 0.8206000000000001, "calib/mu_w": 0.41495145631067964, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15387351778656125, "calib/std_conf": 0.44099722031906413, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5324347826086957, "calib/step_q_c_n": 805.0, "calib/step_q_gap": 0.18633505397911637, "calib/step_q_w": 0.34609972862957933, "calib/step_q_w_n": 737.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2976.0, "completions/max_terminated_length": 2976.0, "completions/mean_length": 547.703125, "completions/mean_terminated_length": 549.8510131835938, "completions/min_length": 0.0, "completions/min_terminated_length": 103.0, "epoch": 0.1888, "grad_norm": 0.04313787445425987, "kl": 0.06238555908203125, "learning_rate": 6.388888888888889e-07, "loss": -0.002, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.032127976417541504, "mask/share_reasoning": 0.8453376889228821, "mask/share_step_conf": 0.11862808465957642, "num_tokens": 41737228.0, "reward": 0.9590833783149719, "reward_std": 0.16259649395942688, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7471988201141357, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8561241626739502, "step": 177 }, { "adv/mean_abs_final_conf": 0.5764954090118408, "adv/mean_abs_reasoning": 0.5464339852333069, "adv/mean_abs_step_conf": 0.7638071775436401, "adv/ratio_final_to_reasoning": 1.0550138252577734, "adv/ratio_step_to_reasoning": 1.3978032080444687, "adv/std_final_conf": 0.8018712401390076, "adv/std_reasoning": 0.7928348779678345, "adv/std_step_conf": 0.9337645173072815, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8475838926174497, "calib/avg_num_step_conf": 5.27734375, "calib/ece": 0.1528112449799197, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.570281124497992, "calib/gap": 0.5750194630872483, "calib/mean_conf": 0.636987951807229, "calib/mu_c": 0.8679194630872483, "calib/mu_w": 0.2929, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.09570281124497995, "calib/std_conf": 0.4432765958372831, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5526944444444445, "calib/step_q_c_n": 720.0, "calib/step_q_gap": 0.23023802606092625, "calib/step_q_w": 0.32245641838351824, "calib/step_q_w_n": 631.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2948.0, "completions/max_terminated_length": 2948.0, "completions/mean_length": 502.83984375, "completions/mean_terminated_length": 508.8023986816406, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.18986666666666666, "grad_norm": 0.03984364494681358, "kl": 0.067535400390625, "learning_rate": 6.111111111111112e-07, "loss": 0.0287, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03440903127193451, "mask/share_reasoning": 0.8346933722496033, "mask/share_step_conf": 0.11917882412672043, "num_tokens": 41972027.0, "reward": 0.9897167682647705, "reward_std": 0.18722940981388092, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.8117296695709229, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8575475811958313, "step": 178 }, { "adv/mean_abs_final_conf": 0.5882589817047119, "adv/mean_abs_reasoning": 0.4754542112350464, "adv/mean_abs_step_conf": 0.7456690073013306, "adv/ratio_final_to_reasoning": 1.2372568541913684, "adv/ratio_step_to_reasoning": 1.5683297984980942, "adv/std_final_conf": 0.823574960231781, "adv/std_reasoning": 0.739263117313385, "adv/std_step_conf": 0.9337601661682129, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8059141201264488, "calib/avg_num_step_conf": 5.40625, "calib/ece": 0.19980000000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.62, "calib/gap": 0.44298867228661754, "calib/mean_conf": 0.6828399999999999, "calib/mu_c": 0.8671232876712329, "calib/mu_w": 0.4241346153846154, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14932000000000006, "calib/std_conf": 0.42460939038132445, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.582780678851175, "calib/step_q_c_n": 766.0, "calib/step_q_gap": 0.21857355911007464, "calib/step_q_w": 0.36420711974110037, "calib/step_q_w_n": 618.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3039.0, "completions/max_terminated_length": 3039.0, "completions/mean_length": 481.26953125, "completions/mean_terminated_length": 488.90875244140625, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.19093333333333334, "grad_norm": 0.05356639623641968, "kl": 0.06893157958984375, "learning_rate": 5.833333333333334e-07, "loss": 0.0005, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03361428156495094, "mask/share_reasoning": 0.8343751430511475, "mask/share_step_conf": 0.11638560891151428, "num_tokens": 42201496.0, "reward": 0.9610702395439148, "reward_std": 0.18741470575332642, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7639027237892151, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8488626480102539, "step": 179 }, { "adv/mean_abs_final_conf": 0.5343791842460632, "adv/mean_abs_reasoning": 0.28797054290771484, "adv/mean_abs_step_conf": 0.7218352556228638, "adv/ratio_final_to_reasoning": 1.855673079788978, "adv/ratio_step_to_reasoning": 2.506628797287049, "adv/std_final_conf": 0.7881537675857544, "adv/std_reasoning": 0.6184922456741333, "adv/std_step_conf": 0.9330692887306213, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7854037059335073, "calib/avg_num_step_conf": 5.875, "calib/ece": 0.17584000000000005, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.604, "calib/gap": 0.4683497223894574, "calib/mean_conf": 0.6805599999999999, "calib/mu_c": 0.866026490066225, "calib/mu_w": 0.39767676767676763, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12620000000000003, "calib/std_conf": 0.4189650181101043, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5608493771234428, "calib/step_q_c_n": 883.0, "calib/step_q_gap": 0.13266902285613208, "calib/step_q_w": 0.42818035426731077, "calib/step_q_w_n": 621.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2876.0, "completions/max_terminated_length": 2876.0, "completions/mean_length": 596.83984375, "completions/mean_terminated_length": 606.3135375976562, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.192, "grad_norm": 0.05060546100139618, "kl": 0.05588531494140625, "learning_rate": 5.555555555555555e-07, "loss": -0.0949, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.029479168355464935, "mask/share_reasoning": 0.8440501093864441, "mask/share_step_conf": 0.11084578931331635, "num_tokens": 42458143.0, "reward": 0.9723742604255676, "reward_std": 0.15452314913272858, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7846351265907288, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8468320965766907, "step": 180 }, { "adv/mean_abs_final_conf": 0.6660337448120117, "adv/mean_abs_reasoning": 0.5127053260803223, "adv/mean_abs_step_conf": 0.7276250123977661, "adv/ratio_final_to_reasoning": 1.2990575890909868, "adv/ratio_step_to_reasoning": 1.4191875437700714, "adv/std_final_conf": 0.8656556606292725, "adv/std_reasoning": 0.7753406763076782, "adv/std_step_conf": 0.9336156249046326, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7831430532160459, "calib/avg_num_step_conf": 5.578125, "calib/ece": 0.24133858267716535, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.562992125984252, "calib/gap": 0.3968432216607398, "calib/mean_conf": 0.6356692913385826, "calib/mu_c": 0.8184671532846715, "calib/mu_w": 0.42162393162393164, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16881889763779526, "calib/std_conf": 0.4343962967156663, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5725960264900662, "calib/step_q_c_n": 755.0, "calib/step_q_gap": 0.19060494179467252, "calib/step_q_w": 0.3819910846953937, "calib/step_q_w_n": 673.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2227.0, "completions/max_terminated_length": 2227.0, "completions/mean_length": 494.2265625, "completions/mean_terminated_length": 496.16473388671875, "completions/min_length": 0.0, "completions/min_terminated_length": 186.0, "epoch": 0.19306666666666666, "grad_norm": 0.035632502287626266, "kl": 0.07423782348632812, "learning_rate": 5.277777777777779e-07, "loss": -0.0462, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.033344317227602005, "mask/share_reasoning": 0.839685320854187, "mask/share_step_conf": 0.12306413054466248, "num_tokens": 42690929.0, "reward": 0.9542344212532043, "reward_std": 0.2006225287914276, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.7449023723602295, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8580977320671082, "step": 181 }, { "adv/mean_abs_final_conf": 0.5287920236587524, "adv/mean_abs_reasoning": 0.3742181062698364, "adv/mean_abs_step_conf": 0.7488981485366821, "adv/ratio_final_to_reasoning": 1.4130583603495066, "adv/ratio_step_to_reasoning": 2.001234403117513, "adv/std_final_conf": 0.7815595865249634, "adv/std_reasoning": 0.6815541982650757, "adv/std_step_conf": 0.9339718818664551, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.8582563896694768, "calib/avg_num_step_conf": 5.765625, "calib/ece": 0.16573122529644274, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6837944664031621, "calib/gap": 0.5054824033186134, "calib/mean_conf": 0.7398023715415021, "calib/mu_c": 0.9276100628930815, "calib/mu_w": 0.4221276595744681, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13853754940711469, "calib/std_conf": 0.39932200504078263, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5976770186335404, "calib/step_q_c_n": 805.0, "calib/step_q_gap": 0.2225652451611112, "calib/step_q_w": 0.3751117734724292, "calib/step_q_w_n": 671.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2116.0, "completions/max_terminated_length": 2116.0, "completions/mean_length": 496.69140625, "completions/mean_terminated_length": 502.5810546875, "completions/min_length": 0.0, "completions/min_terminated_length": 184.0, "epoch": 0.19413333333333332, "grad_norm": 0.03509395942091942, "kl": 0.0664520263671875, "learning_rate": 5.000000000000001e-07, "loss": -0.0732, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03328807279467583, "mask/share_reasoning": 0.8332520127296448, "mask/share_step_conf": 0.12174117565155029, "num_tokens": 42924242.0, "reward": 1.005211353302002, "reward_std": 0.1748979389667511, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.8209699392318726, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8675776720046997, "step": 182 }, { "adv/mean_abs_final_conf": 0.6386405825614929, "adv/mean_abs_reasoning": 0.5655806064605713, "adv/mean_abs_step_conf": 0.761902928352356, "adv/ratio_final_to_reasoning": 1.129176947134263, "adv/ratio_step_to_reasoning": 1.3471164315912076, "adv/std_final_conf": 0.8399648070335388, "adv/std_reasoning": 0.7929946184158325, "adv/std_step_conf": 0.9347866773605347, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7205112721417068, "calib/avg_num_step_conf": 5.140625, "calib/ece": 0.27548780487804875, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.556910569105691, "calib/gap": 0.323743961352657, "calib/mean_conf": 0.6293902439024389, "calib/mu_c": 0.7715217391304348, "calib/mu_w": 0.4477777777777778, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.17195121951219508, "calib/std_conf": 0.438842752624257, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5706896551724139, "calib/step_q_c_n": 667.0, "calib/step_q_gap": 0.15645236703682058, "calib/step_q_w": 0.4142372881355933, "calib/step_q_w_n": 649.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2662.0, "completions/max_terminated_length": 2662.0, "completions/mean_length": 522.3359375, "completions/mean_terminated_length": 528.5296630859375, "completions/min_length": 0.0, "completions/min_terminated_length": 200.0, "epoch": 0.1952, "grad_norm": 0.035424672067165375, "kl": 0.06060791015625, "learning_rate": 4.7222222222222226e-07, "loss": -0.0903, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.032264262437820435, "mask/share_reasoning": 0.8487693667411804, "mask/share_step_conf": 0.10724763572216034, "num_tokens": 43164640.0, "reward": 0.9020237326622009, "reward_std": 0.23391547799110413, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.687953531742096, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.816093921661377, "step": 183 }, { "adv/mean_abs_final_conf": 0.5525701642036438, "adv/mean_abs_reasoning": 0.48427891731262207, "adv/mean_abs_step_conf": 0.7636120915412903, "adv/ratio_final_to_reasoning": 1.1410163532824966, "adv/ratio_step_to_reasoning": 1.5768022605211762, "adv/std_final_conf": 0.7689260840415955, "adv/std_reasoning": 0.739284098148346, "adv/std_step_conf": 0.9334125518798828, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7401926782273603, "calib/avg_num_step_conf": 5.94140625, "calib/ece": 0.18125000000000002, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.7096774193548387, "calib/gap": 0.41945818882466296, "calib/mean_conf": 0.7540725806451613, "calib/mu_c": 0.8809248554913297, "calib/mu_w": 0.46146666666666675, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11887096774193553, "calib/std_conf": 0.3948794563593181, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5471383975026014, "calib/step_q_c_n": 961.0, "calib/step_q_gap": 0.1796383975026014, "calib/step_q_w": 0.3675, "calib/step_q_w_n": 560.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2611.0, "completions/max_terminated_length": 2611.0, "completions/mean_length": 517.7890625, "completions/mean_terminated_length": 526.0079956054688, "completions/min_length": 0.0, "completions/min_terminated_length": 119.0, "epoch": 0.19626666666666667, "grad_norm": 0.032069843262434006, "kl": 0.0685272216796875, "learning_rate": 4.444444444444445e-07, "loss": -0.0207, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03259740397334099, "mask/share_reasoning": 0.8336936235427856, "mask/share_step_conf": 0.11808392405509949, "num_tokens": 43402474.0, "reward": 0.9748326539993286, "reward_std": 0.17759272456169128, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.7816808223724365, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8382970094680786, "step": 184 }, { "adv/mean_abs_final_conf": 0.6131725311279297, "adv/mean_abs_reasoning": 0.48545408248901367, "adv/mean_abs_step_conf": 0.7590094804763794, "adv/ratio_final_to_reasoning": 1.2630906881740076, "adv/ratio_step_to_reasoning": 1.563504166212376, "adv/std_final_conf": 0.8436364531517029, "adv/std_reasoning": 0.7574914693832397, "adv/std_step_conf": 0.9346086978912354, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.8027830755103482, "calib/avg_num_step_conf": 6.23046875, "calib/ece": 0.19818181818181813, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.6528925619834711, "calib/gap": 0.4702097902097901, "calib/mean_conf": 0.7096694214876034, "calib/mu_c": 0.9020279720279719, "calib/mu_w": 0.4318181818181818, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15847107438016522, "calib/std_conf": 0.42130735896474336, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5844903581267218, "calib/step_q_c_n": 726.0, "calib/step_q_gap": 0.26459392544547905, "calib/step_q_w": 0.31989643268124274, "calib/step_q_w_n": 869.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2878.0, "completions/max_terminated_length": 2878.0, "completions/mean_length": 519.1640625, "completions/mean_terminated_length": 538.0809936523438, "completions/min_length": 0.0, "completions/min_terminated_length": 177.0, "epoch": 0.19733333333333333, "grad_norm": 0.043537501245737076, "kl": 0.068511962890625, "learning_rate": 4.1666666666666667e-07, "loss": -0.1086, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.031707763671875, "mask/share_reasoning": 0.8220318555831909, "mask/share_step_conf": 0.11110415309667587, "num_tokens": 43642300.0, "reward": 0.9259820580482483, "reward_std": 0.22024211287498474, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7505718469619751, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8006108999252319, "step": 185 }, { "adv/mean_abs_final_conf": 0.623105525970459, "adv/mean_abs_reasoning": 0.5284816026687622, "adv/mean_abs_step_conf": 0.771432101726532, "adv/ratio_final_to_reasoning": 1.1790486609635198, "adv/ratio_step_to_reasoning": 1.4597142035425679, "adv/std_final_conf": 0.829008162021637, "adv/std_reasoning": 0.7576667070388794, "adv/std_step_conf": 0.9336704015731812, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7929896907216495, "calib/avg_num_step_conf": 5.52734375, "calib/ece": 0.19708502024291497, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6680161943319838, "calib/gap": 0.4486886597938145, "calib/mean_conf": 0.7161943319838056, "calib/mu_c": 0.8924000000000001, "calib/mu_w": 0.4437113402061856, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.15299595141700403, "calib/std_conf": 0.4141740850915179, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.6124999999999999, "calib/step_q_c_n": 748.0, "calib/step_q_gap": 0.2139542728635681, "calib/step_q_w": 0.3985457271364318, "calib/step_q_w_n": 667.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2743.0, "completions/max_terminated_length": 2743.0, "completions/mean_length": 504.34765625, "completions/mean_terminated_length": 512.3532104492188, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.1984, "grad_norm": 0.035920802503824234, "kl": 0.06513214111328125, "learning_rate": 3.8888888888888895e-07, "loss": -0.1681, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03315415605902672, "mask/share_reasoning": 0.8398368954658508, "mask/share_step_conf": 0.11138398945331573, "num_tokens": 43876453.0, "reward": 0.9535995721817017, "reward_std": 0.24708446860313416, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7642765045166016, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8327664136886597, "step": 186 }, { "adv/mean_abs_final_conf": 0.6541886329650879, "adv/mean_abs_reasoning": 0.559043824672699, "adv/mean_abs_step_conf": 0.7574201822280884, "adv/ratio_final_to_reasoning": 1.1701920387871074, "adv/ratio_step_to_reasoning": 1.354849385325975, "adv/std_final_conf": 0.8290735483169556, "adv/std_reasoning": 0.7754475474357605, "adv/std_step_conf": 0.9339062571525574, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7392602206359506, "calib/avg_num_step_conf": 6.33984375, "calib/ece": 0.2710843373493975, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6506024096385542, "calib/gap": 0.35007852044127186, "calib/mean_conf": 0.7215261044176707, "calib/mu_c": 0.8832089552238805, "calib/mu_w": 0.5331304347826087, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.22722891566265058, "calib/std_conf": 0.40691127714917147, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5349680715197956, "calib/step_q_c_n": 783.0, "calib/step_q_gap": 0.14157283342455756, "calib/step_q_w": 0.3933952380952381, "calib/step_q_w_n": 840.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2927.0, "completions/max_terminated_length": 2927.0, "completions/mean_length": 536.58984375, "completions/mean_terminated_length": 545.107177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.19946666666666665, "grad_norm": 0.034603968262672424, "kl": 0.05792999267578125, "learning_rate": 3.611111111111111e-07, "loss": -0.1217, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.030966894701123238, "mask/share_reasoning": 0.8398405313491821, "mask/share_step_conf": 0.11356760561466217, "num_tokens": 44115364.0, "reward": 0.9120051860809326, "reward_std": 0.237847238779068, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7025140523910522, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8230587244033813, "step": 187 }, { "adv/mean_abs_final_conf": 0.6000089645385742, "adv/mean_abs_reasoning": 0.3837595582008362, "adv/mean_abs_step_conf": 0.7395734190940857, "adv/ratio_final_to_reasoning": 1.5635023329492328, "adv/ratio_step_to_reasoning": 1.9271791497816932, "adv/std_final_conf": 0.8135210275650024, "adv/std_reasoning": 0.6815720796585083, "adv/std_step_conf": 0.9343740344047546, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7540782199167293, "calib/avg_num_step_conf": 5.90234375, "calib/ece": 0.22043650793650793, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6706349206349206, "calib/gap": 0.39143334925943635, "calib/mean_conf": 0.7353571428571428, "calib/mu_c": 0.8767080745341617, "calib/mu_w": 0.4852747252747253, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15845238095238096, "calib/std_conf": 0.40426922045551494, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5919388954171563, "calib/step_q_c_n": 851.0, "calib/step_q_gap": 0.18362071359897442, "calib/step_q_w": 0.40831818181818186, "calib/step_q_w_n": 660.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2669.0, "completions/max_terminated_length": 2669.0, "completions/mean_length": 557.171875, "completions/mean_terminated_length": 559.3568725585938, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.20053333333333334, "grad_norm": 0.03549211099743843, "kl": 0.06658172607421875, "learning_rate": 3.3333333333333335e-07, "loss": 0.0939, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03242797404527664, "mask/share_reasoning": 0.8466348648071289, "mask/share_step_conf": 0.11703091114759445, "num_tokens": 44362072.0, "reward": 0.9675827622413635, "reward_std": 0.17826011776924133, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7650222778320312, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8474870324134827, "step": 188 }, { "adv/mean_abs_final_conf": 0.5870659947395325, "adv/mean_abs_reasoning": 0.406009316444397, "adv/mean_abs_step_conf": 0.7587201595306396, "adv/ratio_final_to_reasoning": 1.4459421766001057, "adv/ratio_step_to_reasoning": 1.868725984356929, "adv/std_final_conf": 0.7885566353797913, "adv/std_reasoning": 0.6816694736480713, "adv/std_step_conf": 0.9334494471549988, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7710961877628544, "calib/avg_num_step_conf": 5.33984375, "calib/ece": 0.25399209486166, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.549407114624506, "calib/gap": 0.38102428435761776, "calib/mean_conf": 0.6072727272727272, "calib/mu_c": 0.744320987654321, "calib/mu_w": 0.3632967032967032, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11047430830039523, "calib/std_conf": 0.45210101612569953, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.524469135802469, "calib/step_q_c_n": 810.0, "calib/step_q_gap": 0.2071441806857724, "calib/step_q_w": 0.3173249551166966, "calib/step_q_w_n": 557.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2740.0, "completions/max_terminated_length": 2740.0, "completions/mean_length": 499.375, "completions/mean_terminated_length": 501.3333740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.2016, "grad_norm": 0.03689796105027199, "kl": 0.07184600830078125, "learning_rate": 3.055555555555556e-07, "loss": 0.1004, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03577050566673279, "mask/share_reasoning": 0.8457111716270447, "mask/share_step_conf": 0.11461208760738373, "num_tokens": 44597680.0, "reward": 0.953514575958252, "reward_std": 0.16989631950855255, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.731041431427002, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8517688512802124, "step": 189 }, { "adv/mean_abs_final_conf": 0.6480185985565186, "adv/mean_abs_reasoning": 0.45343148708343506, "adv/mean_abs_step_conf": 0.717402458190918, "adv/ratio_final_to_reasoning": 1.4291433590655735, "adv/ratio_step_to_reasoning": 1.5821628594992347, "adv/std_final_conf": 0.8678444623947144, "adv/std_reasoning": 0.7392775416374207, "adv/std_step_conf": 0.9342573881149292, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8118368700265252, "calib/avg_num_step_conf": 6.41796875, "calib/ece": 0.20867469879518075, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5542168674698795, "calib/gap": 0.4598554376657826, "calib/mean_conf": 0.6195180722891566, "calib/mu_c": 0.8115862068965518, "calib/mu_w": 0.35173076923076924, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12293172690763056, "calib/std_conf": 0.4409607171745155, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5320570749108205, "calib/step_q_c_n": 841.0, "calib/step_q_gap": 0.14219423201805242, "calib/step_q_w": 0.3898628428927681, "calib/step_q_w_n": 802.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2585.0, "completions/max_terminated_length": 2585.0, "completions/mean_length": 562.12890625, "completions/mean_terminated_length": 575.6200561523438, "completions/min_length": 0.0, "completions/min_terminated_length": 181.0, "epoch": 0.20266666666666666, "grad_norm": 0.03430723026394844, "kl": 0.05941009521484375, "learning_rate": 2.7777777777777776e-07, "loss": -0.1591, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.028694037348031998, "mask/share_reasoning": 0.8382810354232788, "mask/share_step_conf": 0.10958744585514069, "num_tokens": 44847193.0, "reward": 0.9562456607818604, "reward_std": 0.2037629783153534, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7631875276565552, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.841491162776947, "step": 190 }, { "adv/mean_abs_final_conf": 0.6388992071151733, "adv/mean_abs_reasoning": 0.4522554874420166, "adv/mean_abs_step_conf": 0.7552422285079956, "adv/ratio_final_to_reasoning": 1.4126953123970358, "adv/ratio_step_to_reasoning": 1.6699459696546517, "adv/std_final_conf": 0.8288201689720154, "adv/std_reasoning": 0.7392417788505554, "adv/std_step_conf": 0.9342796802520752, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7233987603305785, "calib/avg_num_step_conf": 6.33203125, "calib/ece": 0.27963855421686745, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.6626506024096386, "calib/gap": 0.34462293388429754, "calib/mean_conf": 0.7169076305220884, "calib/mu_c": 0.884375, "calib/mu_w": 0.5397520661157025, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.24124497991967872, "calib/std_conf": 0.4155091659276454, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5751640513552069, "calib/step_q_c_n": 701.0, "calib/step_q_gap": 0.16002274700738078, "calib/step_q_w": 0.4151413043478261, "calib/step_q_w_n": 920.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3058.0, "completions/max_terminated_length": 3058.0, "completions/mean_length": 492.35546875, "completions/mean_terminated_length": 502.1633605957031, "completions/min_length": 0.0, "completions/min_terminated_length": 117.0, "epoch": 0.20373333333333332, "grad_norm": 0.055539507418870926, "kl": 0.0661163330078125, "learning_rate": 2.5000000000000004e-07, "loss": -0.0214, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03583353012800217, "mask/share_reasoning": 0.8158671855926514, "mask/share_step_conf": 0.12876802682876587, "num_tokens": 45077404.0, "reward": 0.9033346176147461, "reward_std": 0.18452668190002441, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.6892011761665344, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8229367733001709, "step": 191 }, { "adv/mean_abs_final_conf": 0.63950514793396, "adv/mean_abs_reasoning": 0.5159052610397339, "adv/mean_abs_step_conf": 0.7172884345054626, "adv/ratio_final_to_reasoning": 1.2395786517957348, "adv/ratio_step_to_reasoning": 1.3903491370874363, "adv/std_final_conf": 0.849496603012085, "adv/std_reasoning": 0.7755297422409058, "adv/std_step_conf": 0.9343807697296143, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.8191850006918501, "calib/avg_num_step_conf": 5.84375, "calib/ece": 0.18514285714285714, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.6244897959183674, "calib/gap": 0.5007395876573959, "calib/mean_conf": 0.6875918367346939, "calib/mu_c": 0.8899315068493151, "calib/mu_w": 0.3891919191919192, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1384081632653061, "calib/std_conf": 0.4220072500763874, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5468974700399468, "calib/step_q_c_n": 751.0, "calib/step_q_gap": 0.20658874520773202, "calib/step_q_w": 0.3403087248322148, "calib/step_q_w_n": 745.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2460.0, "completions/max_terminated_length": 2460.0, "completions/mean_length": 513.36328125, "completions/mean_terminated_length": 527.795166015625, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.2048, "grad_norm": 0.03801960498094559, "kl": 0.06374359130859375, "learning_rate": 2.2222222222222224e-07, "loss": -0.1241, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.034386202692985535, "mask/share_reasoning": 0.8210378885269165, "mask/share_step_conf": 0.11723221093416214, "num_tokens": 45313801.0, "reward": 0.9600014686584473, "reward_std": 0.2434854954481125, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7788914442062378, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8356428146362305, "step": 192 }, { "adv/mean_abs_final_conf": 0.6938143968582153, "adv/mean_abs_reasoning": 0.6675748825073242, "adv/mean_abs_step_conf": 0.7467559576034546, "adv/ratio_final_to_reasoning": 1.0393057244040382, "adv/ratio_step_to_reasoning": 1.118610027385597, "adv/std_final_conf": 0.8796382546424866, "adv/std_reasoning": 0.87473464012146, "adv/std_step_conf": 0.9342511296272278, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7879343629343628, "calib/avg_num_step_conf": 5.59765625, "calib/ece": 0.25896414342629476, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6374501992031872, "calib/gap": 0.3452612612612612, "calib/mean_conf": 0.7113147410358567, "calib/mu_c": 0.8640000000000001, "calib/mu_w": 0.5187387387387389, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2062549800796812, "calib/std_conf": 0.4062365713892538, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.55, "calib/step_q_c_n": 716.0, "calib/step_q_gap": 0.1575732217573222, "calib/step_q_w": 0.39242677824267785, "calib/step_q_w_n": 717.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2785.0, "completions/max_terminated_length": 2785.0, "completions/mean_length": 517.73828125, "completions/mean_terminated_length": 521.81494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 210.0, "epoch": 0.20586666666666667, "grad_norm": 0.06238226965069771, "kl": 0.0617523193359375, "learning_rate": 1.9444444444444447e-07, "loss": 0.0348, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.031888388097286224, "mask/share_reasoning": 0.8456735610961914, "mask/share_step_conf": 0.11462554335594177, "num_tokens": 45552054.0, "reward": 0.9268547296524048, "reward_std": 0.25273197889328003, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7135710716247559, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8370133638381958, "step": 193 }, { "adv/mean_abs_final_conf": 0.6090821623802185, "adv/mean_abs_reasoning": 0.41220623254776, "adv/mean_abs_step_conf": 0.7578562498092651, "adv/ratio_final_to_reasoning": 1.4776151214784157, "adv/ratio_step_to_reasoning": 1.8385366109704724, "adv/std_final_conf": 0.8400105237960815, "adv/std_reasoning": 0.7012957334518433, "adv/std_step_conf": 0.9326762557029724, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8238798238798238, "calib/avg_num_step_conf": 5.2421875, "calib/ece": 0.20282868525896425, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6334661354581673, "calib/gap": 0.4949307174307173, "calib/mean_conf": 0.6903984063745021, "calib/mu_c": 0.9033566433566432, "calib/mu_w": 0.4084259259259259, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16175298804780885, "calib/std_conf": 0.43141091678464316, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5734880636604774, "calib/step_q_c_n": 754.0, "calib/step_q_gap": 0.21416833576932093, "calib/step_q_w": 0.3593197278911565, "calib/step_q_w_n": 588.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2750.0, "completions/max_terminated_length": 2750.0, "completions/mean_length": 488.0234375, "completions/mean_terminated_length": 489.9372863769531, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.20693333333333333, "grad_norm": 0.0440903939306736, "kl": 0.06768798828125, "learning_rate": 1.6666666666666668e-07, "loss": 0.0089, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.035178959369659424, "mask/share_reasoning": 0.8498561978340149, "mask/share_step_conf": 0.11105857044458389, "num_tokens": 45782932.0, "reward": 0.975521445274353, "reward_std": 0.20822137594223022, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7812730073928833, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8611760139465332, "step": 194 }, { "adv/mean_abs_final_conf": 0.653519868850708, "adv/mean_abs_reasoning": 0.554317831993103, "adv/mean_abs_step_conf": 0.7447904348373413, "adv/ratio_final_to_reasoning": 1.1789623770552617, "adv/ratio_step_to_reasoning": 1.3436162285441473, "adv/std_final_conf": 0.8451783061027527, "adv/std_reasoning": 0.7929887771606445, "adv/std_step_conf": 0.9340994358062744, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.7995283018867925, "calib/avg_num_step_conf": 5.9609375, "calib/ece": 0.20425000000000001, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.5666666666666667, "calib/gap": 0.5018276541819207, "calib/mean_conf": 0.6285833333333334, "calib/mu_c": 0.8502238805970149, "calib/mu_w": 0.3483962264150943, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.13725, "calib/std_conf": 0.44864851839224384, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5661994219653179, "calib/step_q_c_n": 692.0, "calib/step_q_gap": 0.23670301908762015, "calib/step_q_w": 0.32949640287769777, "calib/step_q_w_n": 834.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3007.0, "completions/max_terminated_length": 3007.0, "completions/mean_length": 520.796875, "completions/mean_terminated_length": 531.1713256835938, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.208, "grad_norm": 0.037474848330020905, "kl": 0.0653839111328125, "learning_rate": 1.3888888888888888e-07, "loss": -0.0751, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.032066501677036285, "mask/share_reasoning": 0.8334785103797913, "mask/share_step_conf": 0.11492373049259186, "num_tokens": 46022240.0, "reward": 0.917759895324707, "reward_std": 0.2372083067893982, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7431319952011108, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.8009814023971558, "step": 195 }, { "adv/mean_abs_final_conf": 0.521324634552002, "adv/mean_abs_reasoning": 0.34145301580429077, "adv/mean_abs_step_conf": 0.7322190403938293, "adv/ratio_final_to_reasoning": 1.526782925973064, "adv/ratio_step_to_reasoning": 2.1444210667435204, "adv/std_final_conf": 0.7769902348518372, "adv/std_reasoning": 0.640144407749176, "adv/std_step_conf": 0.9338326454162598, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.725809470335018, "calib/avg_num_step_conf": 4.80859375, "calib/ece": 0.2730314960629922, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7362204724409449, "calib/gap": 0.3394616008484622, "calib/mean_conf": 0.7872834645669291, "calib/mu_c": 0.9436496350364963, "calib/mu_w": 0.6041880341880341, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.260472440944882, "calib/std_conf": 0.3682905049356989, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6186495176848875, "calib/step_q_c_n": 622.0, "calib/step_q_gap": 0.11920780996731767, "calib/step_q_w": 0.49944170771756985, "calib/step_q_w_n": 609.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1352.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 411.95703125, "completions/mean_terminated_length": 413.57257080078125, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.20906666666666668, "grad_norm": 0.04654751718044281, "kl": 0.07309722900390625, "learning_rate": 1.1111111111111112e-07, "loss": -0.0278, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03892698884010315, "mask/share_reasoning": 0.8334881067276001, "mask/share_step_conf": 0.12367869168519974, "num_tokens": 46230245.0, "reward": 0.9221498370170593, "reward_std": 0.17452925443649292, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.7174800634384155, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8213508129119873, "step": 196 }, { "adv/mean_abs_final_conf": 0.6606262922286987, "adv/mean_abs_reasoning": 0.5360690951347351, "adv/mean_abs_step_conf": 0.7593021392822266, "adv/ratio_final_to_reasoning": 1.2323528780607238, "adv/ratio_step_to_reasoning": 1.416425879002378, "adv/std_final_conf": 0.8487818241119385, "adv/std_reasoning": 0.7928498983383179, "adv/std_step_conf": 0.9338219165802002, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7951929365598431, "calib/avg_num_step_conf": 5.875, "calib/ece": 0.22253012048192775, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5863453815261044, "calib/gap": 0.39522890778286446, "calib/mean_conf": 0.6810843373493976, "calib/mu_c": 0.85568345323741, "calib/mu_w": 0.46045454545454556, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17269076305220887, "calib/std_conf": 0.40987904587163415, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5547791164658634, "calib/step_q_c_n": 747.0, "calib/step_q_gap": 0.14450170563363085, "calib/step_q_w": 0.41027741083223257, "calib/step_q_w_n": 757.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2476.0, "completions/max_terminated_length": 2476.0, "completions/mean_length": 527.15234375, "completions/mean_terminated_length": 533.4031982421875, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.21013333333333334, "grad_norm": 0.04660060629248619, "kl": 0.0655059814453125, "learning_rate": 8.333333333333334e-08, "loss": 0.0005, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.032303206622600555, "mask/share_reasoning": 0.8369505405426025, "mask/share_step_conf": 0.1190275102853775, "num_tokens": 46470252.0, "reward": 0.9414454698562622, "reward_std": 0.20183053612709045, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.7443073987960815, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8354585766792297, "step": 197 }, { "adv/mean_abs_final_conf": 0.5518897175788879, "adv/mean_abs_reasoning": 0.46058160066604614, "adv/mean_abs_step_conf": 0.7634322047233582, "adv/ratio_final_to_reasoning": 1.1982452550879177, "adv/ratio_step_to_reasoning": 1.6575395187722661, "adv/std_final_conf": 0.7794657945632935, "adv/std_reasoning": 0.7206063270568848, "adv/std_step_conf": 0.9341621994972229, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.8199675324675324, "calib/avg_num_step_conf": 5.7734375, "calib/ece": 0.17019685039370086, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5354330708661418, "calib/gap": 0.5053181818181819, "calib/mean_conf": 0.6128740157480316, "calib/mu_c": 0.8118181818181819, "calib/mu_w": 0.3065, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08838582677165359, "calib/std_conf": 0.440503478801961, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.559171528588098, "calib/step_q_c_n": 857.0, "calib/step_q_gap": 0.1234710454963106, "calib/step_q_w": 0.43570048309178744, "calib/step_q_w_n": 621.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2915.0, "completions/max_terminated_length": 2915.0, "completions/mean_length": 488.0703125, "completions/mean_terminated_length": 488.0703125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.2112, "grad_norm": 0.055203624069690704, "kl": 0.0720977783203125, "learning_rate": 5.555555555555556e-08, "loss": 0.1019, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03624938800930977, "mask/share_reasoning": 0.8318265676498413, "mask/share_step_conf": 0.131924107670784, "num_tokens": 46700582.0, "reward": 0.9913797378540039, "reward_std": 0.146388441324234, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.8021363019943237, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8618730902671814, "step": 198 }, { "adv/mean_abs_final_conf": 0.6371850967407227, "adv/mean_abs_reasoning": 0.4301682710647583, "adv/mean_abs_step_conf": 0.7736830711364746, "adv/ratio_final_to_reasoning": 1.4812461531938501, "adv/ratio_step_to_reasoning": 1.7985591294807584, "adv/std_final_conf": 0.8231662511825562, "adv/std_reasoning": 0.701428234577179, "adv/std_step_conf": 0.9334186911582947, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7222439660795825, "calib/avg_num_step_conf": 5.66796875, "calib/ece": 0.2613147410358567, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6733067729083665, "calib/gap": 0.30401630789302025, "calib/mean_conf": 0.7252191235059761, "calib/mu_c": 0.8523972602739727, "calib/mu_w": 0.5483809523809524, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.20243027888446233, "calib/std_conf": 0.4011063531864865, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5576440460947503, "calib/step_q_c_n": 781.0, "calib/step_q_gap": 0.1511962849007204, "calib/step_q_w": 0.40644776119402987, "calib/step_q_w_n": 670.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2966.0, "completions/max_terminated_length": 2966.0, "completions/mean_length": 543.89453125, "completions/mean_terminated_length": 548.1771850585938, "completions/min_length": 0.0, "completions/min_terminated_length": 114.0, "epoch": 0.21226666666666666, "grad_norm": 0.036017391830682755, "kl": 0.0663299560546875, "learning_rate": 2.777777777777778e-08, "loss": -0.0326, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.034367334097623825, "mask/share_reasoning": 0.8351020812988281, "mask/share_step_conf": 0.12271807342767715, "num_tokens": 46944019.0, "reward": 0.9380004405975342, "reward_std": 0.20820194482803345, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7090073823928833, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8568372130393982, "step": 199 }, { "adv/mean_abs_final_conf": 0.5297156572341919, "adv/mean_abs_reasoning": 0.44187745451927185, "adv/mean_abs_step_conf": 0.764366626739502, "adv/ratio_final_to_reasoning": 1.1987840787452737, "adv/ratio_step_to_reasoning": 1.7298158548755858, "adv/std_final_conf": 0.7672268748283386, "adv/std_reasoning": 0.7205674648284912, "adv/std_step_conf": 0.9337111115455627, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.8338223717168826, "calib/avg_num_step_conf": 5.44140625, "calib/ece": 0.19406504065040642, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6463414634146342, "calib/gap": 0.496431689687254, "calib/mean_conf": 0.6830081300813008, "calib/mu_c": 0.8626114649681529, "calib/mu_w": 0.3661797752808989, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.119430894308943, "calib/std_conf": 0.4381992622424606, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5943325242718447, "calib/step_q_c_n": 824.0, "calib/step_q_gap": 0.27080001109082535, "calib/step_q_w": 0.32353251318101933, "calib/step_q_w_n": 569.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2950.0, "completions/max_terminated_length": 2950.0, "completions/mean_length": 542.28515625, "completions/mean_terminated_length": 548.7154541015625, "completions/min_length": 0.0, "completions/min_terminated_length": 103.0, "epoch": 0.21333333333333335, "grad_norm": 0.03856263682246208, "kl": 0.0735321044921875, "learning_rate": 0.0, "loss": -0.0412, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03279662877321243, "mask/share_reasoning": 0.8446874618530273, "mask/share_step_conf": 0.11079715937376022, "num_tokens": 47190892.0, "reward": 0.9715377688407898, "reward_std": 0.18524646759033203, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7729078531265259, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8545427322387695, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": -0.02450802539009601, "train_runtime": 14799.4515, "train_samples_per_second": 3.46, "train_steps_per_second": 0.014 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 47190892, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }