{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.47760647535324097, "adv/mean_abs_reasoning": 0.4569147527217865, "adv/mean_abs_step_conf": 0.7568557262420654, "adv/ratio_final_to_reasoning": 1.0452857398632815, "adv/ratio_step_to_reasoning": 1.6564484331783258, "adv/std_final_conf": 0.7227410674095154, "adv/std_reasoning": 0.7206857204437256, "adv/std_step_conf": 0.9254623055458069, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5086206896551725, "calib/avg_num_step_conf": 7.875, "calib/ece": 0.2888991935483871, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0001713264989126051, "calib/mean_conf": 0.9905120967741936, "calib/mu_c": 0.9905632183908043, "calib/mu_w": 0.9903918918918917, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2888991935483871, "calib/std_conf": 0.0021794159006610276, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9119477557027226, "calib/step_q_c_n": 1359.0, "calib/step_q_gap": 0.0056311651395566376, "calib/step_q_w": 0.9063165905631659, "calib/step_q_w_n": 657.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2494.0, "completions/max_terminated_length": 2494.0, "completions/mean_length": 755.49609375, "completions/mean_terminated_length": 776.7349243164062, "completions/min_length": 0.0, "completions/min_terminated_length": 397.0, "epoch": 0.0010666666666666667, "grad_norm": 0.02502845786511898, "kl": 0.0005849599838256836, "learning_rate": 2.5000000000000004e-07, "loss": -0.1291, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.018737709149718285, "mask/share_reasoning": 0.845859944820404, "mask/share_step_conf": 0.10805858671665192, "num_tokens": 300991.0, "reward": 0.8747976422309875, "reward_std": 0.23758892714977264, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.6851503849029541, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7347574234008789, "step": 1 }, { "adv/mean_abs_final_conf": 0.437887966632843, "adv/mean_abs_reasoning": 0.4207462966442108, "adv/mean_abs_step_conf": 0.7215397357940674, "adv/ratio_final_to_reasoning": 1.0407411072310102, "adv/ratio_step_to_reasoning": 1.7149045435430461, "adv/std_final_conf": 0.6832791566848755, "adv/std_reasoning": 0.6817297339439392, "adv/std_step_conf": 0.9222810864448547, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4872611464968153, "calib/avg_num_step_conf": 7.6953125, "calib/ece": 0.36465737051792824, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00024481637078155316, "calib/mean_conf": 0.9901553784860557, "calib/mu_c": 0.990063694267516, "calib/mu_w": 0.9903085106382975, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36465737051792824, "calib/std_conf": 0.001222205307190084, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9075405636208369, "calib/step_q_c_n": 1171.0, "calib/step_q_gap": -0.003804868168900244, "calib/step_q_w": 0.9113454317897371, "calib/step_q_w_n": 799.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2743.0, "completions/max_terminated_length": 2743.0, "completions/mean_length": 840.640625, "completions/mean_terminated_length": 850.6087036132812, "completions/min_length": 0.0, "completions/min_terminated_length": 466.0, "epoch": 0.0021333333333333334, "grad_norm": 0.021789954975247383, "kl": 0.0016820430755615234, "learning_rate": 5.000000000000001e-07, "loss": -0.0097, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01788702979683876, "mask/share_reasoning": 0.8706268668174744, "mask/share_step_conf": 0.09976735711097717, "num_tokens": 619483.0, "reward": 0.818764328956604, "reward_std": 0.2176743745803833, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6203019618988037, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.6984766721725464, "step": 2 }, { "adv/mean_abs_final_conf": 0.3907315135002136, "adv/mean_abs_reasoning": 0.37838390469551086, "adv/mean_abs_step_conf": 0.7500925660133362, "adv/ratio_final_to_reasoning": 1.0326324895205017, "adv/ratio_step_to_reasoning": 1.9823585430171582, "adv/std_final_conf": 0.6628881096839905, "adv/std_reasoning": 0.6613768935203552, "adv/std_step_conf": 0.9263537526130676, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4993700904105529, "calib/avg_num_step_conf": 7.734375, "calib/ece": 0.30087649402390426, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.2598191788670832e-05, "calib/mean_conf": 0.9901195219123505, "calib/mu_c": 0.990115606936416, "calib/mu_w": 0.9901282051282047, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30087649402390426, "calib/std_conf": 0.001086707704939113, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9121627906976743, "calib/step_q_c_n": 1290.0, "calib/step_q_gap": -0.00253286147623899, "calib/step_q_w": 0.9146956521739132, "calib/step_q_w_n": 690.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1638.0, "completions/max_terminated_length": 1638.0, "completions/mean_length": 786.2890625, "completions/mean_terminated_length": 801.9522094726562, "completions/min_length": 0.0, "completions/min_terminated_length": 431.0, "epoch": 0.0032, "grad_norm": 0.022982632741332054, "kl": 0.0005571246147155762, "learning_rate": 7.5e-07, "loss": -0.0907, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.018434403464198112, "mask/share_reasoning": 0.8618663549423218, "mask/share_step_conf": 0.10016795992851257, "num_tokens": 926029.0, "reward": 0.8705581426620483, "reward_std": 0.2126719355583191, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.6816222667694092, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7290253043174744, "step": 3 }, { "adv/mean_abs_final_conf": 0.49821069836616516, "adv/mean_abs_reasoning": 0.48443976044654846, "adv/mean_abs_step_conf": 0.776309609413147, "adv/ratio_final_to_reasoning": 1.028426522849658, "adv/ratio_step_to_reasoning": 1.6024894585398146, "adv/std_final_conf": 0.7398562431335449, "adv/std_reasoning": 0.7394483685493469, "adv/std_step_conf": 0.9243927597999573, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5064102564102564, "calib/avg_num_step_conf": 7.61328125, "calib/ece": 0.30415322580645165, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9959677419354839, "calib/gap": 0.001153846153846727, "calib/mean_conf": 0.9896370967741935, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.9888461538461534, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.30415322580645165, "calib/std_conf": 0.005703471887480457, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9096366508688782, "calib/step_q_c_n": 1266.0, "calib/step_q_gap": -0.002105662454694235, "calib/step_q_w": 0.9117423133235725, "calib/step_q_w_n": 683.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2882.0, "completions/max_terminated_length": 2882.0, "completions/mean_length": 810.3046875, "completions/mean_terminated_length": 819.9130859375, "completions/min_length": 0.0, "completions/min_terminated_length": 471.0, "epoch": 0.004266666666666667, "grad_norm": 0.027756016701459885, "kl": 0.0005552768707275391, "learning_rate": 1.0000000000000002e-06, "loss": -0.0467, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.018273094668984413, "mask/share_reasoning": 0.8700456619262695, "mask/share_step_conf": 0.0999625027179718, "num_tokens": 1239635.0, "reward": 0.8520616292953491, "reward_std": 0.26222509145736694, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.6667401790618896, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7116017937660217, "step": 4 }, { "adv/mean_abs_final_conf": 0.43325841426849365, "adv/mean_abs_reasoning": 0.43388810753822327, "adv/mean_abs_step_conf": 0.7429900765419006, "adv/ratio_final_to_reasoning": 0.9985487196842008, "adv/ratio_step_to_reasoning": 1.7124001871299206, "adv/std_final_conf": 0.7010958194732666, "adv/std_reasoning": 0.7014488577842712, "adv/std_step_conf": 0.9139078855514526, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.49477886977886976, "calib/avg_num_step_conf": 7.58984375, "calib/ece": 0.44691358024691363, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00010442260442244944, "calib/mean_conf": 0.9901234567901235, "calib/mu_c": 0.9900757575757576, "calib/mu_w": 0.9901801801801801, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.44691358024691363, "calib/std_conf": 0.0011042310999998973, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9138218390804598, "calib/step_q_c_n": 1044.0, "calib/step_q_gap": 0.009327956989247377, "calib/step_q_w": 0.9044938820912124, "calib/step_q_w_n": 899.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3016.0, "completions/max_terminated_length": 3016.0, "completions/mean_length": 816.8515625, "completions/mean_terminated_length": 833.12353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 435.0, "epoch": 0.005333333333333333, "grad_norm": 0.02338205836713314, "kl": 0.000583648681640625, "learning_rate": 1.25e-06, "loss": -0.076, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.018079116940498352, "mask/share_reasoning": 0.8604636192321777, "mask/share_step_conf": 0.1019260585308075, "num_tokens": 1555437.0, "reward": 0.716595470905304, "reward_std": 0.2071603238582611, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5240468978881836, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.6161753535270691, "step": 5 }, { "adv/mean_abs_final_conf": 0.3801310658454895, "adv/mean_abs_reasoning": 0.37981629371643066, "adv/mean_abs_step_conf": 0.7121192216873169, "adv/ratio_final_to_reasoning": 1.0008287483561562, "adv/ratio_step_to_reasoning": 1.8749043510465675, "adv/std_final_conf": 0.6808795928955078, "adv/std_reasoning": 0.6816310286521912, "adv/std_step_conf": 0.9247093796730042, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.49514563106796117, "calib/avg_num_step_conf": 8.0703125, "calib/ece": 0.4070445344129554, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -9.708737864055905e-05, "calib/mean_conf": 0.9900404858299594, "calib/mu_c": 0.99, "calib/mu_w": 0.9900970873786406, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4070445344129554, "calib/std_conf": 0.0006349954308242161, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9109003496503498, "calib/step_q_c_n": 1144.0, "calib/step_q_gap": -0.0015399974212336742, "calib/step_q_w": 0.9124403470715835, "calib/step_q_w_n": 922.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2505.0, "completions/max_terminated_length": 2505.0, "completions/mean_length": 722.78515625, "completions/mean_terminated_length": 743.1043701171875, "completions/min_length": 0.0, "completions/min_terminated_length": 474.0, "epoch": 0.0064, "grad_norm": 0.017047492787241936, "kl": 0.0007506012916564941, "learning_rate": 1.5e-06, "loss": -0.1258, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.01940595731139183, "mask/share_reasoning": 0.842772364616394, "mask/share_step_conf": 0.11047791689634323, "num_tokens": 1846422.0, "reward": 0.7581819295883179, "reward_std": 0.19750076532363892, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.5703726410865784, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.6405224800109863, "step": 6 }, { "adv/mean_abs_final_conf": 0.47522249817848206, "adv/mean_abs_reasoning": 0.44085246324539185, "adv/mean_abs_step_conf": 0.7773077487945557, "adv/ratio_final_to_reasoning": 1.07796266959715, "adv/ratio_step_to_reasoning": 1.763192481839174, "adv/std_final_conf": 0.7201507091522217, "adv/std_reasoning": 0.7015153765678406, "adv/std_step_conf": 0.9311604499816895, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.4850123385106692, "calib/avg_num_step_conf": 7.453125, "calib/ece": 0.3234859437751004, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00027710843373507377, "calib/mean_conf": 0.9901526104417671, "calib/mu_c": 0.9900602409638553, "calib/mu_w": 0.9903373493975903, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3234859437751004, "calib/std_conf": 0.001196045699004783, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9102295081967213, "calib/step_q_c_n": 1220.0, "calib/step_q_gap": 0.006629217499046991, "calib/step_q_w": 0.9036002906976743, "calib/step_q_w_n": 688.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2809.0, "completions/max_terminated_length": 2809.0, "completions/mean_length": 827.58984375, "completions/mean_terminated_length": 844.0757446289062, "completions/min_length": 0.0, "completions/min_terminated_length": 483.0, "epoch": 0.007466666666666667, "grad_norm": 0.018745925277471542, "kl": 0.0005162954330444336, "learning_rate": 1.75e-06, "loss": -0.0747, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.017496969550848007, "mask/share_reasoning": 0.8684599995613098, "mask/share_step_conf": 0.09451176226139069, "num_tokens": 2165709.0, "reward": 0.8448113203048706, "reward_std": 0.22868016362190247, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6545296311378479, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7116554379463196, "step": 7 }, { "adv/mean_abs_final_conf": 0.39101529121398926, "adv/mean_abs_reasoning": 0.3464837372303009, "adv/mean_abs_step_conf": 0.7708368897438049, "adv/ratio_final_to_reasoning": 1.128524225522565, "adv/ratio_step_to_reasoning": 2.224741905365229, "adv/std_final_conf": 0.6386479139328003, "adv/std_reasoning": 0.6186771988868713, "adv/std_step_conf": 0.9308523535728455, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.48971253465635484, "calib/avg_num_step_conf": 7.4765625, "calib/ece": 0.35645679012345677, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00019597256675907282, "calib/mean_conf": 0.990201646090535, "calib/mu_c": 0.9901298701298701, "calib/mu_w": 0.9903258426966292, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35645679012345677, "calib/std_conf": 0.0013923946001348608, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9107012987012988, "calib/step_q_c_n": 1155.0, "calib/step_q_gap": 0.019510257858083913, "calib/step_q_w": 0.8911910408432149, "calib/step_q_w_n": 759.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2407.0, "completions/max_terminated_length": 2407.0, "completions/mean_length": 850.4921875, "completions/mean_terminated_length": 867.4342651367188, "completions/min_length": 0.0, "completions/min_terminated_length": 434.0, "epoch": 0.008533333333333334, "grad_norm": 0.025057844817638397, "kl": 0.0005748271942138672, "learning_rate": 2.0000000000000003e-06, "loss": -0.0931, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.017516685649752617, "mask/share_reasoning": 0.8684801459312439, "mask/share_step_conf": 0.0944719910621643, "num_tokens": 2489947.0, "reward": 0.8075144290924072, "reward_std": 0.1630677580833435, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6081961393356323, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.6966763734817505, "step": 8 }, { "adv/mean_abs_final_conf": 0.43854090571403503, "adv/mean_abs_reasoning": 0.3960496783256531, "adv/mean_abs_step_conf": 0.7646069526672363, "adv/ratio_final_to_reasoning": 1.1072876199976192, "adv/ratio_step_to_reasoning": 1.930583445742723, "adv/std_final_conf": 0.6832506060600281, "adv/std_reasoning": 0.6614897847175598, "adv/std_step_conf": 0.9325852990150452, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.4974292072749597, "calib/avg_num_step_conf": 8.27734375, "calib/ece": 0.3319291666666666, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.9958333333333333, "calib/gap": 0.011749597114573018, "calib/mean_conf": 0.9860958333333334, "calib/mu_c": 0.9901592356687897, "calib/mu_w": 0.9784096385542167, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3319291666666666, "calib/std_conf": 0.06380075743520255, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9100164338537388, "calib/step_q_c_n": 1217.0, "calib/step_q_gap": -0.005450306722758036, "calib/step_q_w": 0.9154667405764968, "calib/step_q_w_n": 902.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2320.0, "completions/max_terminated_length": 2320.0, "completions/mean_length": 760.39453125, "completions/mean_terminated_length": 801.0740356445312, "completions/min_length": 0.0, "completions/min_terminated_length": 460.0, "epoch": 0.0096, "grad_norm": 0.01747042126953602, "kl": 0.0006932616233825684, "learning_rate": 2.25e-06, "loss": -0.2195, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.017814621329307556, "mask/share_reasoning": 0.8285001516342163, "mask/share_step_conf": 0.10290395468473434, "num_tokens": 2792144.0, "reward": 0.79695063829422, "reward_std": 0.23099634051322937, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.623283863067627, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.6604610681533813, "step": 9 }, { "adv/mean_abs_final_conf": 0.4888319671154022, "adv/mean_abs_reasoning": 0.4622969925403595, "adv/mean_abs_step_conf": 0.7920699119567871, "adv/ratio_final_to_reasoning": 1.0573981120431497, "adv/ratio_step_to_reasoning": 1.713335636479698, "adv/std_final_conf": 0.7395802140235901, "adv/std_reasoning": 0.7206360697746277, "adv/std_step_conf": 0.9307116866111755, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5014992503748127, "calib/avg_num_step_conf": 7.44921875, "calib/ece": 0.3411612903225807, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 3.312629399632172e-05, "calib/mean_conf": 0.9903548387096774, "calib/mu_c": 0.9903664596273293, "calib/mu_w": 0.990333333333333, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3411612903225807, "calib/std_conf": 0.0018302720949977284, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9100284280936453, "calib/step_q_c_n": 1196.0, "calib/step_q_gap": 0.006634616560593054, "calib/step_q_w": 0.9033938115330522, "calib/step_q_w_n": 711.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3047.0, "completions/max_terminated_length": 3047.0, "completions/mean_length": 827.7421875, "completions/mean_terminated_length": 840.8809814453125, "completions/min_length": 0.0, "completions/min_terminated_length": 442.0, "epoch": 0.010666666666666666, "grad_norm": 0.025133531540632248, "kl": 0.0028550028800964355, "learning_rate": 2.5e-06, "loss": -0.0486, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.017763039097189903, "mask/share_reasoning": 0.8690170645713806, "mask/share_step_conf": 0.09759492427110672, "num_tokens": 3110846.0, "reward": 0.8297994136810303, "reward_std": 0.23124775290489197, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6353831887245178, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7046843767166138, "step": 10 }, { "adv/mean_abs_final_conf": 0.4417019486427307, "adv/mean_abs_reasoning": 0.4121132791042328, "adv/mean_abs_step_conf": 0.7525558471679688, "adv/ratio_final_to_reasoning": 1.07179741842537, "adv/ratio_step_to_reasoning": 1.8260897799840863, "adv/std_final_conf": 0.7013117074966431, "adv/std_reasoning": 0.6817113161087036, "adv/std_step_conf": 0.9160208702087402, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4804754993434239, "calib/avg_num_step_conf": 7.4140625, "calib/ece": 0.35407200000000005, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0003705853894533373, "calib/mean_conf": 0.9900720000000001, "calib/mu_c": 0.9899371069182389, "calib/mu_w": 0.9903076923076922, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35407200000000005, "calib/std_conf": 0.0012044982357811913, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9084249384741593, "calib/step_q_c_n": 1219.0, "calib/step_q_gap": 0.002796072494777757, "calib/step_q_w": 0.9056288659793815, "calib/step_q_w_n": 679.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2120.0, "completions/max_terminated_length": 2120.0, "completions/mean_length": 829.9453125, "completions/mean_terminated_length": 843.1190795898438, "completions/min_length": 0.0, "completions/min_terminated_length": 450.0, "epoch": 0.011733333333333333, "grad_norm": 0.028363078832626343, "kl": 0.0006840825080871582, "learning_rate": 2.7500000000000004e-06, "loss": -0.0067, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.017736785113811493, "mask/share_reasoning": 0.8707066178321838, "mask/share_step_conf": 0.09593162685632706, "num_tokens": 3427792.0, "reward": 0.8359595537185669, "reward_std": 0.22266630828380585, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.6278089284896851, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7253601551055908, "step": 11 }, { "adv/mean_abs_final_conf": 0.44267383217811584, "adv/mean_abs_reasoning": 0.43362054228782654, "adv/mean_abs_step_conf": 0.7366559505462646, "adv/ratio_final_to_reasoning": 1.0208783694668229, "adv/ratio_step_to_reasoning": 1.6988492903486356, "adv/std_final_conf": 0.7020049095153809, "adv/std_reasoning": 0.7015119194984436, "adv/std_step_conf": 0.9088842272758484, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5025559668605676, "calib/avg_num_step_conf": 8.26171875, "calib/ece": 0.23716599190283394, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 5.111933721113626e-05, "calib/mean_conf": 0.9902024291497975, "calib/mu_c": 0.9902150537634409, "calib/mu_w": 0.9901639344262297, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23716599190283394, "calib/std_conf": 0.0014083017919778221, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9120886889460155, "calib/step_q_c_n": 1556.0, "calib/step_q_gap": 0.004432159429020643, "calib/step_q_w": 0.9076565295169948, "calib/step_q_w_n": 559.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2495.0, "completions/max_terminated_length": 2495.0, "completions/mean_length": 760.328125, "completions/mean_terminated_length": 775.47412109375, "completions/min_length": 0.0, "completions/min_terminated_length": 329.0, "epoch": 0.0128, "grad_norm": 0.022897174581885338, "kl": 0.0008499026298522949, "learning_rate": 3e-06, "loss": -0.0844, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.019012587144970894, "mask/share_reasoning": 0.8476526141166687, "mask/share_step_conf": 0.11380352079868317, "num_tokens": 3726612.0, "reward": 0.9202487468719482, "reward_std": 0.21560242772102356, "rewards/accuracy_reward_step": 0.7265625, "rewards/final_brier_reward_step": 0.7311554551124573, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7710606455802917, "step": 12 }, { "adv/mean_abs_final_conf": 0.5370169281959534, "adv/mean_abs_reasoning": 0.5062729120254517, "adv/mean_abs_step_conf": 0.7291202545166016, "adv/ratio_final_to_reasoning": 1.0607261724659607, "adv/ratio_step_to_reasoning": 1.4401723600016483, "adv/std_final_conf": 0.8014257550239563, "adv/std_reasoning": 0.7929041981697083, "adv/std_step_conf": 0.9328606724739075, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.49700598802395207, "calib/avg_num_step_conf": 8.265625, "calib/ece": 0.31101626016260164, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00017964071856246822, "calib/mean_conf": 0.9898780487804878, "calib/mu_c": 0.9898203592814371, "calib/mu_w": 0.9899999999999995, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31101626016260164, "calib/std_conf": 0.0019088385173778716, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9110812883435584, "calib/step_q_c_n": 1304.0, "calib/step_q_gap": 0.009049268639124808, "calib/step_q_w": 0.9020320197044336, "calib/step_q_w_n": 812.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2578.0, "completions/max_terminated_length": 2578.0, "completions/mean_length": 782.8828125, "completions/mean_terminated_length": 808.1370849609375, "completions/min_length": 0.0, "completions/min_terminated_length": 360.0, "epoch": 0.013866666666666666, "grad_norm": 3275.6826171875, "kl": 6432.037212371826, "learning_rate": 3.2500000000000002e-06, "loss": 74.9262, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01816052757203579, "mask/share_reasoning": 0.8450461626052856, "mask/share_step_conf": 0.1055433452129364, "num_tokens": 4031622.0, "reward": 0.8452422022819519, "reward_std": 0.2676650285720825, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.6583359241485596, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7102735042572021, "step": 13 }, { "adv/mean_abs_final_conf": 0.39153605699539185, "adv/mean_abs_reasoning": 0.3841407597064972, "adv/mean_abs_step_conf": 0.7506622076034546, "adv/ratio_final_to_reasoning": 1.0192515298156464, "adv/ratio_step_to_reasoning": 1.954133188514018, "adv/std_final_conf": 0.6616212129592896, "adv/std_reasoning": 0.6613654494285583, "adv/std_step_conf": 0.9337601661682129, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.5099337748344371, "calib/avg_num_step_conf": 7.6953125, "calib/ece": 0.36615289256198347, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0001920529801324733, "calib/mean_conf": 0.9901198347107438, "calib/mu_c": 0.9901920529801324, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36615289256198347, "calib/std_conf": 0.001070885926181841, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9089094827586207, "calib/step_q_c_n": 1160.0, "calib/step_q_gap": 0.007625532141336766, "calib/step_q_w": 0.901283950617284, "calib/step_q_w_n": 810.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2683.0, "completions/max_terminated_length": 2683.0, "completions/mean_length": 808.359375, "completions/mean_terminated_length": 837.8137817382812, "completions/min_length": 0.0, "completions/min_terminated_length": 494.0, "epoch": 0.014933333333333333, "grad_norm": 0.015768852084875107, "kl": 0.0010614991188049316, "learning_rate": 3.5e-06, "loss": -0.1338, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.01759328506886959, "mask/share_reasoning": 0.8488487601280212, "mask/share_step_conf": 0.09840168058872223, "num_tokens": 4343962.0, "reward": 0.8007533550262451, "reward_std": 0.1906447410583496, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.5968597531318665, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.6976156234741211, "step": 14 }, { "adv/mean_abs_final_conf": 0.4488527774810791, "adv/mean_abs_reasoning": 0.4263514280319214, "adv/mean_abs_step_conf": 0.7604366540908813, "adv/ratio_final_to_reasoning": 1.0527765312128214, "adv/ratio_step_to_reasoning": 1.7835911975271879, "adv/std_final_conf": 0.7022488117218018, "adv/std_reasoning": 0.7015137076377869, "adv/std_step_conf": 0.9223707318305969, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5033783783783784, "calib/avg_num_step_conf": 7.6171875, "calib/ece": 0.38841463414634136, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 6.75675675674281e-05, "calib/mean_conf": 0.990040650406504, "calib/mu_c": 0.9900675675675674, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38841463414634136, "calib/std_conf": 0.0006362795057926237, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9109266609145816, "calib/step_q_c_n": 1159.0, "calib/step_q_gap": 0.0021655989676788723, "calib/step_q_w": 0.9087610619469028, "calib/step_q_w_n": 791.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2554.0, "completions/max_terminated_length": 2554.0, "completions/mean_length": 765.7421875, "completions/mean_terminated_length": 787.26904296875, "completions/min_length": 0.0, "completions/min_terminated_length": 438.0, "epoch": 0.016, "grad_norm": 0.021051140502095222, "kl": 0.0009889006614685059, "learning_rate": 3.7500000000000005e-06, "loss": -0.1004, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01849520020186901, "mask/share_reasoning": 0.8517429828643799, "mask/share_step_conf": 0.10241805016994476, "num_tokens": 4647872.0, "reward": 0.7852047681808472, "reward_std": 0.22741200029850006, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.5856855511665344, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.6761302947998047, "step": 15 }, { "adv/mean_abs_final_conf": 0.44075024127960205, "adv/mean_abs_reasoning": 0.4255930781364441, "adv/mean_abs_step_conf": 0.7281934022903442, "adv/ratio_final_to_reasoning": 1.0356142144264353, "adv/ratio_step_to_reasoning": 1.7110085659261762, "adv/std_final_conf": 0.7203139662742615, "adv/std_reasoning": 0.7205998301506042, "adv/std_step_conf": 0.9330427646636963, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.49390243902439024, "calib/avg_num_step_conf": 7.68359375, "calib/ece": 0.32336991869918696, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00010975609756103122, "calib/mean_conf": 0.9900365853658536, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9901097560975609, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32336991869918696, "calib/std_conf": 0.0005726515552133617, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9064437450826123, "calib/step_q_c_n": 1271.0, "calib/step_q_gap": 0.007529951979163951, "calib/step_q_w": 0.8989137931034483, "calib/step_q_w_n": 696.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3015.0, "completions/max_terminated_length": 3015.0, "completions/mean_length": 916.58984375, "completions/mean_terminated_length": 931.138916015625, "completions/min_length": 0.0, "completions/min_terminated_length": 472.0, "epoch": 0.017066666666666667, "grad_norm": 0.024245938286185265, "kl": 0.0011619329452514648, "learning_rate": 4.000000000000001e-06, "loss": -0.0161, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.016265347599983215, "mask/share_reasoning": 0.8771912455558777, "mask/share_step_conf": 0.0909183919429779, "num_tokens": 4991367.0, "reward": 0.8399591445922852, "reward_std": 0.2094450294971466, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.64686518907547, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7127405405044556, "step": 16 }, { "adv/mean_abs_final_conf": 0.446701318025589, "adv/mean_abs_reasoning": 0.4303044080734253, "adv/mean_abs_step_conf": 0.7567791938781738, "adv/ratio_final_to_reasoning": 1.0381053729511547, "adv/ratio_step_to_reasoning": 1.7587065799917168, "adv/std_final_conf": 0.6828343272209167, "adv/std_reasoning": 0.6818340420722961, "adv/std_step_conf": 0.9191504120826721, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.921875, "calib/ece": 0.2509236947791166, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00046153846153840217, "calib/mean_conf": 0.9898795180722892, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.9895384615384617, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2509236947791166, "calib/std_conf": 0.002610132820766576, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9065742024965325, "calib/step_q_c_n": 1442.0, "calib/step_q_gap": 0.006215840721788268, "calib/step_q_w": 0.9003583617747443, "calib/step_q_w_n": 586.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2965.0, "completions/max_terminated_length": 2965.0, "completions/mean_length": 813.078125, "completions/mean_terminated_length": 832.592041015625, "completions/min_length": 0.0, "completions/min_terminated_length": 469.0, "epoch": 0.018133333333333335, "grad_norm": 0.01629556715488434, "kl": 0.0016171932220458984, "learning_rate": 4.25e-06, "loss": -0.1074, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.017606137320399284, "mask/share_reasoning": 0.8571971654891968, "mask/share_step_conf": 0.1017591655254364, "num_tokens": 5303043.0, "reward": 0.9147970080375671, "reward_std": 0.23791097104549408, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.7239562273025513, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7673563957214355, "step": 17 }, { "adv/mean_abs_final_conf": 0.38064736127853394, "adv/mean_abs_reasoning": 0.3477587401866913, "adv/mean_abs_step_conf": 0.7276158332824707, "adv/ratio_final_to_reasoning": 1.0945730970677736, "adv/ratio_step_to_reasoning": 2.0923006360439897, "adv/std_final_conf": 0.6817070841789246, "adv/std_reasoning": 0.6612668037414551, "adv/std_step_conf": 0.9329667687416077, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.4954954954954955, "calib/avg_num_step_conf": 7.2421875, "calib/ece": 0.4376209677419356, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -9.00900900897561e-05, "calib/mean_conf": 0.9900403225806452, "calib/mu_c": 0.99, "calib/mu_w": 0.9900900900900897, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4376209677419356, "calib/std_conf": 0.0006337190986089406, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9095242718446602, "calib/step_q_c_n": 1030.0, "calib/step_q_gap": 0.009682038834951445, "calib/step_q_w": 0.8998422330097088, "calib/step_q_w_n": 824.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1674.0, "completions/max_terminated_length": 1674.0, "completions/mean_length": 813.5625, "completions/mean_terminated_length": 829.7689208984375, "completions/min_length": 0.0, "completions/min_terminated_length": 443.0, "epoch": 0.0192, "grad_norm": 0.022068968042731285, "kl": 0.0018869638442993164, "learning_rate": 4.5e-06, "loss": -0.0985, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01774272322654724, "mask/share_reasoning": 0.8674094080924988, "mask/share_step_conf": 0.09531664848327637, "num_tokens": 5622035.0, "reward": 0.7283351421356201, "reward_std": 0.20351342856884003, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5358418226242065, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.6216096878051758, "step": 18 }, { "adv/mean_abs_final_conf": 0.4503156542778015, "adv/mean_abs_reasoning": 0.4281848669052124, "adv/mean_abs_step_conf": 0.7243156433105469, "adv/ratio_final_to_reasoning": 1.0516851226726989, "adv/ratio_step_to_reasoning": 1.6915956151035323, "adv/std_final_conf": 0.7379180788993835, "adv/std_reasoning": 0.7205255627632141, "adv/std_step_conf": 0.9321519732475281, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5071428571428571, "calib/avg_num_step_conf": 7.796875, "calib/ece": 0.4278273092369479, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0001520314547837076, "calib/mean_conf": 0.9900763052208836, "calib/mu_c": 0.990142857142857, "calib/mu_w": 0.9899908256880733, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4278273092369479, "calib/std_conf": 0.001234624637367013, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9074858223062381, "calib/step_q_c_n": 1058.0, "calib/step_q_gap": 0.011288594161248722, "calib/step_q_w": 0.8961972281449894, "calib/step_q_w_n": 938.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2846.0, "completions/max_terminated_length": 2846.0, "completions/mean_length": 823.5078125, "completions/mean_terminated_length": 833.2727661132812, "completions/min_length": 0.0, "completions/min_terminated_length": 412.0, "epoch": 0.020266666666666665, "grad_norm": 0.01661011576652527, "kl": 0.0021848678588867188, "learning_rate": 4.75e-06, "loss": -0.061, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.018002573400735855, "mask/share_reasoning": 0.8686628937721252, "mask/share_step_conf": 0.1016157791018486, "num_tokens": 5937613.0, "reward": 0.7750768661499023, "reward_std": 0.21773789823055267, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5513952970504761, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.6956332921981812, "step": 19 }, { "adv/mean_abs_final_conf": 0.4298800826072693, "adv/mean_abs_reasoning": 0.4013995826244354, "adv/mean_abs_step_conf": 0.7212347984313965, "adv/ratio_final_to_reasoning": 1.0709529885323306, "adv/ratio_step_to_reasoning": 1.7968000706822134, "adv/std_final_conf": 0.7196903228759766, "adv/std_reasoning": 0.7013648748397827, "adv/std_step_conf": 0.9234609603881836, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.44140625, "calib/ece": 0.3956533864541833, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": 0.0017745098039217977, "calib/mean_conf": 0.9892788844621515, "calib/mu_c": 0.99, "calib/mu_w": 0.9882254901960782, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3956533864541833, "calib/std_conf": 0.011984465058551374, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9077379095163807, "calib/step_q_c_n": 1282.0, "calib/step_q_gap": 0.0017947923377685804, "calib/step_q_w": 0.9059431171786121, "calib/step_q_w_n": 879.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2765.0, "completions/max_terminated_length": 2765.0, "completions/mean_length": 816.8046875, "completions/mean_terminated_length": 823.2362060546875, "completions/min_length": 0.0, "completions/min_terminated_length": 461.0, "epoch": 0.021333333333333333, "grad_norm": 0.015205039642751217, "kl": 0.003265380859375, "learning_rate": 5e-06, "loss": -0.0495, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01863781549036503, "mask/share_reasoning": 0.8605009317398071, "mask/share_step_conf": 0.1130487322807312, "num_tokens": 6251587.0, "reward": 0.7963709831237793, "reward_std": 0.21171826124191284, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.5911605358123779, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.6890814304351807, "step": 20 }, { "adv/mean_abs_final_conf": 0.5546455383300781, "adv/mean_abs_reasoning": 0.5302027463912964, "adv/mean_abs_step_conf": 0.7633264064788818, "adv/ratio_final_to_reasoning": 1.046100839924248, "adv/ratio_step_to_reasoning": 1.4396877641134231, "adv/std_final_conf": 0.7927553653717041, "adv/std_reasoning": 0.7929099798202515, "adv/std_step_conf": 0.9229240417480469, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.503030303030303, "calib/avg_num_step_conf": 8.16796875, "calib/ece": 0.3220242914979756, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 6.06060606060943e-05, "calib/mean_conf": 0.9900404858299594, "calib/mu_c": 0.990060606060606, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3220242914979756, "calib/std_conf": 0.0006349954308242161, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9071587537091987, "calib/step_q_c_n": 1348.0, "calib/step_q_gap": 0.005395631232751885, "calib/step_q_w": 0.9017631224764469, "calib/step_q_w_n": 743.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 3032.0, "completions/max_terminated_length": 3032.0, "completions/mean_length": 809.734375, "completions/mean_terminated_length": 829.1680297851562, "completions/min_length": 0.0, "completions/min_terminated_length": 459.0, "epoch": 0.0224, "grad_norm": 0.011656345799565315, "kl": 0.004233837127685547, "learning_rate": 4.9722222222222224e-06, "loss": -0.1312, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.017971325665712357, "mask/share_reasoning": 0.8535507917404175, "mask/share_step_conf": 0.10504040122032166, "num_tokens": 6561839.0, "reward": 0.8428006768226624, "reward_std": 0.28343454003334045, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6469355225563049, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7175719738006592, "step": 21 }, { "adv/mean_abs_final_conf": 0.4129585325717926, "adv/mean_abs_reasoning": 0.39449843764305115, "adv/mean_abs_step_conf": 0.724773108959198, "adv/ratio_final_to_reasoning": 1.0467938353293162, "adv/ratio_step_to_reasoning": 1.837201468501087, "adv/std_final_conf": 0.6997731924057007, "adv/std_reasoning": 0.6815393567085266, "adv/std_step_conf": 0.9256146550178528, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.49444444444444446, "calib/avg_num_step_conf": 8.41796875, "calib/ece": 0.34860557768924305, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00011111111111106187, "calib/mean_conf": 0.9900398406374502, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9901111111111109, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34860557768924305, "calib/std_conf": 0.0006299357888781636, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9074658573596358, "calib/step_q_c_n": 1318.0, "calib/step_q_gap": 0.00268688483872781, "calib/step_q_w": 0.904778972520908, "calib/step_q_w_n": 837.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2572.0, "completions/max_terminated_length": 2572.0, "completions/mean_length": 812.296875, "completions/mean_terminated_length": 825.1905517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 441.0, "epoch": 0.023466666666666667, "grad_norm": 0.016305429860949516, "kl": 0.0047855377197265625, "learning_rate": 4.944444444444445e-06, "loss": -0.0414, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.017927877604961395, "mask/share_reasoning": 0.8585070967674255, "mask/share_step_conf": 0.10794000327587128, "num_tokens": 6871603.0, "reward": 0.8370054364204407, "reward_std": 0.19273146986961365, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6357617378234863, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7163741588592529, "step": 22 }, { "adv/mean_abs_final_conf": 0.5349942445755005, "adv/mean_abs_reasoning": 0.5196344256401062, "adv/mean_abs_step_conf": 0.7811074256896973, "adv/ratio_final_to_reasoning": 1.0295588940560922, "adv/ratio_step_to_reasoning": 1.5031864463704427, "adv/std_final_conf": 0.7593243718147278, "adv/std_reasoning": 0.7576791644096375, "adv/std_step_conf": 0.9311987161636353, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5031847133757962, "calib/avg_num_step_conf": 8.15234375, "calib/ece": 0.36702380952380953, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 6.369426751606788e-05, "calib/mean_conf": 0.9900396825396826, "calib/mu_c": 0.9900636942675157, "calib/mu_w": 0.9899999999999997, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36702380952380953, "calib/std_conf": 0.0006286896634029713, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.906015625, "calib/step_q_c_n": 1280.0, "calib/step_q_gap": -0.0004899512081784074, "calib/step_q_w": 0.9065055762081784, "calib/step_q_w_n": 807.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3015.0, "completions/max_terminated_length": 3015.0, "completions/mean_length": 824.48046875, "completions/mean_terminated_length": 834.2569580078125, "completions/min_length": 0.0, "completions/min_terminated_length": 494.0, "epoch": 0.024533333333333334, "grad_norm": 0.018598461523652077, "kl": 0.0061092376708984375, "learning_rate": 4.9166666666666665e-06, "loss": -0.0258, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.01792784407734871, "mask/share_reasoning": 0.8628264665603638, "mask/share_step_conf": 0.10752691328525543, "num_tokens": 7186606.0, "reward": 0.8155762553215027, "reward_std": 0.2864585518836975, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6206049919128418, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.6910161972045898, "step": 23 }, { "adv/mean_abs_final_conf": 0.6040255427360535, "adv/mean_abs_reasoning": 0.5883716344833374, "adv/mean_abs_step_conf": 0.7876837253570557, "adv/ratio_final_to_reasoning": 1.0266054774487252, "adv/ratio_step_to_reasoning": 1.3387520390046315, "adv/std_final_conf": 0.8085963726043701, "adv/std_reasoning": 0.8101164102554321, "adv/std_step_conf": 0.9350883364677429, "calib/answer_extract_rate": 0.91015625, "calib/auroc": 0.5038461538461538, "calib/avg_num_step_conf": 8.375, "calib/ece": 0.43210300429184556, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 7.692307692352962e-05, "calib/mean_conf": 0.9900429184549356, "calib/mu_c": 0.9900769230769232, "calib/mu_w": 0.9899999999999997, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.43210300429184556, "calib/std_conf": 0.0006537144296878898, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9081593927893739, "calib/step_q_c_n": 1054.0, "calib/step_q_gap": 0.0001777414132270172, "calib/step_q_w": 0.9079816513761468, "calib/step_q_w_n": 1090.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2672.0, "completions/max_terminated_length": 2672.0, "completions/mean_length": 863.80859375, "completions/mean_terminated_length": 917.5726928710938, "completions/min_length": 0.0, "completions/min_terminated_length": 492.0, "epoch": 0.0256, "grad_norm": 0.021909039467573166, "kl": 0.0059413909912109375, "learning_rate": 4.888888888888889e-06, "loss": -0.3015, "mask/has_final_conf_rate": 0.91015625, "mask/share_final_conf": 0.015839194878935814, "mask/share_reasoning": 0.831562340259552, "mask/share_step_conf": 0.09400475025177002, "num_tokens": 7512253.0, "reward": 0.7043561935424805, "reward_std": 0.2977864742279053, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5118628740310669, "rewards/format_reward_step": 0.90625, "rewards/step_l2_reward": 0.6140369176864624, "step": 24 }, { "adv/mean_abs_final_conf": 0.3722795844078064, "adv/mean_abs_reasoning": 0.3595278859138489, "adv/mean_abs_step_conf": 0.7372299432754517, "adv/ratio_final_to_reasoning": 1.0354678983009766, "adv/ratio_step_to_reasoning": 2.050550102397645, "adv/std_final_conf": 0.6617670655250549, "adv/std_reasoning": 0.6612728238105774, "adv/std_step_conf": 0.9230087399482727, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.50625, "calib/avg_num_step_conf": 7.9375, "calib/ece": 0.35007600000000005, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00011875000000005631, "calib/mean_conf": 0.9900760000000001, "calib/mu_c": 0.99011875, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35007600000000005, "calib/std_conf": 0.0008474809732377484, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.90592277992278, "calib/step_q_c_n": 1295.0, "calib/step_q_gap": 0.0039960499363485, "calib/step_q_w": 0.9019267299864315, "calib/step_q_w_n": 737.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2920.0, "completions/max_terminated_length": 2920.0, "completions/mean_length": 801.48046875, "completions/mean_terminated_length": 804.6235961914062, "completions/min_length": 0.0, "completions/min_terminated_length": 437.0, "epoch": 0.02666666666666667, "grad_norm": 0.0172025877982378, "kl": 0.008014678955078125, "learning_rate": 4.861111111111111e-06, "loss": -0.0154, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.018549812957644463, "mask/share_reasoning": 0.8690973520278931, "mask/share_step_conf": 0.10844656825065613, "num_tokens": 7820656.0, "reward": 0.823807954788208, "reward_std": 0.194789856672287, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.631934404373169, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.6953690052032471, "step": 25 }, { "adv/mean_abs_final_conf": 0.44211018085479736, "adv/mean_abs_reasoning": 0.41643744707107544, "adv/mean_abs_step_conf": 0.7278153300285339, "adv/ratio_final_to_reasoning": 1.0616484755736681, "adv/ratio_step_to_reasoning": 1.747718259122634, "adv/std_final_conf": 0.7366110682487488, "adv/std_reasoning": 0.7205003499984741, "adv/std_step_conf": 0.9267618656158447, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4888888888888889, "calib/avg_num_step_conf": 8.203125, "calib/ece": 0.35008000000000006, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00022222222222212373, "calib/mean_conf": 0.9900800000000001, "calib/mu_c": 0.99, "calib/mu_w": 0.9902222222222221, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35008000000000006, "calib/std_conf": 0.0008908422980528043, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9057789716039908, "calib/step_q_c_n": 1303.0, "calib/step_q_gap": 0.0012243919302140638, "calib/step_q_w": 0.9045545796737767, "calib/step_q_w_n": 797.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2958.0, "completions/max_terminated_length": 2958.0, "completions/mean_length": 834.125, "completions/mean_terminated_length": 840.6929321289062, "completions/min_length": 0.0, "completions/min_terminated_length": 494.0, "epoch": 0.027733333333333332, "grad_norm": 0.018005995079874992, "kl": 0.00823354721069336, "learning_rate": 4.833333333333333e-06, "loss": -0.0557, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.01745178923010826, "mask/share_reasoning": 0.8718188405036926, "mask/share_step_conf": 0.10291685163974762, "num_tokens": 8139432.0, "reward": 0.8200647830963135, "reward_std": 0.19963213801383972, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6317781209945679, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.6880388855934143, "step": 26 }, { "adv/mean_abs_final_conf": 0.4970061182975769, "adv/mean_abs_reasoning": 0.46902090311050415, "adv/mean_abs_step_conf": 0.8002022504806519, "adv/ratio_final_to_reasoning": 1.0596673090718929, "adv/ratio_step_to_reasoning": 1.7061121267171313, "adv/std_final_conf": 0.7211012244224548, "adv/std_reasoning": 0.7208249568939209, "adv/std_step_conf": 0.9283366799354553, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.91015625, "calib/ece": 0.4511836734693877, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 8.849557522139229e-05, "calib/mean_conf": 0.9899591836734694, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.9899115044247787, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4511836734693877, "calib/std_conf": 0.0014279882191457894, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9062722419928826, "calib/step_q_c_n": 1124.0, "calib/step_q_gap": -0.0009619844548270917, "calib/step_q_w": 0.9072342264477097, "calib/step_q_w_n": 1157.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3013.0, "completions/max_terminated_length": 3013.0, "completions/mean_length": 828.1015625, "completions/mean_terminated_length": 858.2753295898438, "completions/min_length": 0.0, "completions/min_terminated_length": 488.0, "epoch": 0.0288, "grad_norm": 0.017703456804156303, "kl": 0.0074367523193359375, "learning_rate": 4.805555555555556e-06, "loss": -0.1344, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.016773667186498642, "mask/share_reasoning": 0.847740888595581, "mask/share_step_conf": 0.10032917559146881, "num_tokens": 8456642.0, "reward": 0.7276268005371094, "reward_std": 0.2555812895298004, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5244327783584595, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.6362894773483276, "step": 27 }, { "adv/mean_abs_final_conf": 0.4407339096069336, "adv/mean_abs_reasoning": 0.4065035879611969, "adv/mean_abs_step_conf": 0.7146235704421997, "adv/ratio_final_to_reasoning": 1.0842066900747853, "adv/ratio_step_to_reasoning": 1.75797604647567, "adv/std_final_conf": 0.7218397855758667, "adv/std_reasoning": 0.7207350134849548, "adv/std_step_conf": 0.9135732650756836, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.890625, "calib/ece": 0.2768852459016393, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 2.220446049250313e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.99, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2768852459016393, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9043002915451895, "calib/step_q_c_n": 1372.0, "calib/step_q_gap": -0.004866375121477251, "calib/step_q_w": 0.9091666666666668, "calib/step_q_w_n": 648.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2613.0, "completions/max_terminated_length": 2613.0, "completions/mean_length": 850.53515625, "completions/mean_terminated_length": 881.5263671875, "completions/min_length": 0.0, "completions/min_terminated_length": 468.0, "epoch": 0.029866666666666666, "grad_norm": 0.015081276185810566, "kl": 0.0077877044677734375, "learning_rate": 4.777777777777778e-06, "loss": -0.1394, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.016542209312319756, "mask/share_reasoning": 0.8544609546661377, "mask/share_step_conf": 0.09384053945541382, "num_tokens": 8781323.0, "reward": 0.8726210594177246, "reward_std": 0.2400265485048294, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.6850608587265015, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7336186170578003, "step": 28 }, { "adv/mean_abs_final_conf": 0.48541906476020813, "adv/mean_abs_reasoning": 0.47797638177871704, "adv/mean_abs_step_conf": 0.7424254417419434, "adv/ratio_final_to_reasoning": 1.0155712358710995, "adv/ratio_step_to_reasoning": 1.5532680484736903, "adv/std_final_conf": 0.7575644850730896, "adv/std_reasoning": 0.757592499256134, "adv/std_step_conf": 0.9264086484909058, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.5134751773049645, "calib/avg_num_step_conf": 7.828125, "calib/ece": 0.4047261410788382, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0006638297872341159, "calib/mean_conf": 0.9897883817427386, "calib/mu_c": 0.990063829787234, "calib/mu_w": 0.9893999999999998, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4047261410788382, "calib/std_conf": 0.002785713008604025, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9064064748201439, "calib/step_q_c_n": 1112.0, "calib/step_q_gap": -0.0006450946865825813, "calib/step_q_w": 0.9070515695067265, "calib/step_q_w_n": 892.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2696.0, "completions/max_terminated_length": 2696.0, "completions/mean_length": 887.25, "completions/mean_terminated_length": 927.085693359375, "completions/min_length": 0.0, "completions/min_terminated_length": 361.0, "epoch": 0.030933333333333334, "grad_norm": 0.020906547084450722, "kl": 0.0071125030517578125, "learning_rate": 4.75e-06, "loss": -0.1638, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.015757828950881958, "mask/share_reasoning": 0.8528798222541809, "mask/share_step_conf": 0.08839359879493713, "num_tokens": 9115587.0, "reward": 0.7560781836509705, "reward_std": 0.2232230305671692, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.5589569807052612, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.654761791229248, "step": 29 }, { "adv/mean_abs_final_conf": 0.6214361190795898, "adv/mean_abs_reasoning": 0.5938949584960938, "adv/mean_abs_step_conf": 0.7599336504936218, "adv/ratio_final_to_reasoning": 1.0463737908353994, "adv/ratio_step_to_reasoning": 1.2795758570134759, "adv/std_final_conf": 0.8352395296096802, "adv/std_reasoning": 0.8266966938972473, "adv/std_step_conf": 0.9322589039802551, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.49640287769784175, "calib/avg_num_step_conf": 8.03125, "calib/ece": 0.41549586776859515, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0002158273381293041, "calib/mean_conf": 0.9898760330578513, "calib/mu_c": 0.9897841726618704, "calib/mu_w": 0.9899999999999997, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.41549586776859515, "calib/std_conf": 0.0019244844664785174, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9041459074733096, "calib/step_q_c_n": 1124.0, "calib/step_q_gap": 0.0051920448123654594, "calib/step_q_w": 0.8989538626609441, "calib/step_q_w_n": 932.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2493.0, "completions/max_terminated_length": 2493.0, "completions/mean_length": 853.328125, "completions/mean_terminated_length": 877.3172607421875, "completions/min_length": 0.0, "completions/min_terminated_length": 518.0, "epoch": 0.032, "grad_norm": 0.02036236599087715, "kl": 0.008612632751464844, "learning_rate": 4.722222222222222e-06, "loss": -0.1238, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.016304276883602142, "mask/share_reasoning": 0.8609112501144409, "mask/share_step_conf": 0.09544073045253754, "num_tokens": 9441023.0, "reward": 0.7334448099136353, "reward_std": 0.3001938462257385, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.550837516784668, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.6191771030426025, "step": 30 }, { "adv/mean_abs_final_conf": 0.4535819888114929, "adv/mean_abs_reasoning": 0.4273011088371277, "adv/mean_abs_step_conf": 0.7394912838935852, "adv/ratio_final_to_reasoning": 1.0615043570700926, "adv/ratio_step_to_reasoning": 1.730609325835973, "adv/std_final_conf": 0.718247652053833, "adv/std_reasoning": 0.7207338213920593, "adv/std_step_conf": 0.9325244426727295, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.49167305414256846, "calib/avg_num_step_conf": 8.5078125, "calib/ece": 0.5190791666666668, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0003363528673956395, "calib/mean_conf": 0.9899125000000001, "calib/mu_c": 0.9897345132743365, "calib/mu_w": 0.9900708661417321, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5190791666666668, "calib/std_conf": 0.0020198623096637073, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9051240560949299, "calib/step_q_c_n": 927.0, "calib/step_q_gap": 0.013545319084538154, "calib/step_q_w": 0.8915787370103917, "calib/step_q_w_n": 1251.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 3058.0, "completions/max_terminated_length": 3058.0, "completions/mean_length": 950.38671875, "completions/mean_terminated_length": 989.020263671875, "completions/min_length": 0.0, "completions/min_terminated_length": 511.0, "epoch": 0.03306666666666667, "grad_norm": 0.014488758519291878, "kl": 0.007906913757324219, "learning_rate": 4.694444444444445e-06, "loss": -0.1558, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.01523461751639843, "mask/share_reasoning": 0.8540059924125671, "mask/share_step_conf": 0.09169687330722809, "num_tokens": 9790234.0, "reward": 0.6493246555328369, "reward_std": 0.2197844237089157, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.4511585831642151, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.5717094540596008, "step": 31 }, { "adv/mean_abs_final_conf": 0.4951266944408417, "adv/mean_abs_reasoning": 0.46537792682647705, "adv/mean_abs_step_conf": 0.7736719250679016, "adv/ratio_final_to_reasoning": 1.0639238904544281, "adv/ratio_step_to_reasoning": 1.6624594345154158, "adv/std_final_conf": 0.7361029982566833, "adv/std_reasoning": 0.7206937670707703, "adv/std_step_conf": 0.9204629063606262, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.49647139431312093, "calib/avg_num_step_conf": 8.72265625, "calib/ece": 0.41995491803278684, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9959016393442623, "calib/gap": 0.0006994861253852847, "calib/mean_conf": 0.9896270491803278, "calib/mu_c": 0.9899280575539567, "calib/mu_w": 0.9892285714285715, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.41995491803278684, "calib/std_conf": 0.005813727915569303, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9043189964157706, "calib/step_q_c_n": 1116.0, "calib/step_q_gap": -0.007287986574381566, "calib/step_q_w": 0.9116069829901522, "calib/step_q_w_n": 1117.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2998.0, "completions/max_terminated_length": 2998.0, "completions/mean_length": 839.625, "completions/mean_terminated_length": 870.2186279296875, "completions/min_length": 0.0, "completions/min_terminated_length": 453.0, "epoch": 0.034133333333333335, "grad_norm": 0.017103182151913643, "kl": 0.010321617126464844, "learning_rate": 4.666666666666667e-06, "loss": -0.1508, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01689147762954235, "mask/share_reasoning": 0.8503791093826294, "mask/share_step_conf": 0.0975731611251831, "num_tokens": 10111882.0, "reward": 0.7398710250854492, "reward_std": 0.22319459915161133, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5509277582168579, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.6303766965866089, "step": 32 }, { "adv/mean_abs_final_conf": 0.41008493304252625, "adv/mean_abs_reasoning": 0.4098625183105469, "adv/mean_abs_step_conf": 0.741827130317688, "adv/ratio_final_to_reasoning": 1.0005426569204137, "adv/ratio_step_to_reasoning": 1.8099413758923335, "adv/std_final_conf": 0.7017787098884583, "adv/std_reasoning": 0.7012677788734436, "adv/std_step_conf": 0.9291035532951355, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.49137931034482757, "calib/avg_num_step_conf": 8.2578125, "calib/ece": 0.4467637795275591, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0001551724137930366, "calib/mean_conf": 0.9900708661417323, "calib/mu_c": 0.99, "calib/mu_w": 0.990155172413793, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4467637795275591, "calib/std_conf": 0.000795470468054303, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9031728288907998, "calib/step_q_c_n": 1163.0, "calib/step_q_gap": 0.01485316537870729, "calib/step_q_w": 0.8883196635120925, "calib/step_q_w_n": 951.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2178.0, "completions/max_terminated_length": 2178.0, "completions/mean_length": 828.76171875, "completions/mean_terminated_length": 835.2874145507812, "completions/min_length": 0.0, "completions/min_terminated_length": 378.0, "epoch": 0.0352, "grad_norm": 0.022355087101459503, "kl": 0.011810302734375, "learning_rate": 4.638888888888889e-06, "loss": -0.0232, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.017790237441658974, "mask/share_reasoning": 0.8689032793045044, "mask/share_step_conf": 0.1054939404129982, "num_tokens": 10430917.0, "reward": 0.759544849395752, "reward_std": 0.20857521891593933, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5478081703186035, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.6658127307891846, "step": 33 }, { "adv/mean_abs_final_conf": 0.5660641193389893, "adv/mean_abs_reasoning": 0.524277925491333, "adv/mean_abs_step_conf": 0.7607817649841309, "adv/ratio_final_to_reasoning": 1.0797023712346765, "adv/ratio_step_to_reasoning": 1.4511039431445747, "adv/std_final_conf": 0.7932949066162109, "adv/std_reasoning": 0.7755244374275208, "adv/std_step_conf": 0.9272559881210327, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5062840803298311, "calib/avg_num_step_conf": 8.11328125, "calib/ece": 0.40353413654618486, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00012568160659687244, "calib/mean_conf": 0.9898795180722892, "calib/mu_c": 0.9899315068493151, "calib/mu_w": 0.9898058252427182, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40353413654618486, "calib/std_conf": 0.0010910102576069185, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9026694915254238, "calib/step_q_c_n": 1180.0, "calib/step_q_gap": 0.008622668782948817, "calib/step_q_w": 0.8940468227424749, "calib/step_q_w_n": 897.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2749.0, "completions/max_terminated_length": 2749.0, "completions/mean_length": 801.87890625, "completions/mean_terminated_length": 817.8526000976562, "completions/min_length": 0.0, "completions/min_terminated_length": 525.0, "epoch": 0.03626666666666667, "grad_norm": 0.017718534916639328, "kl": 0.012401580810546875, "learning_rate": 4.611111111111112e-06, "loss": -0.0627, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01772000640630722, "mask/share_reasoning": 0.8578258752822876, "mask/share_step_conf": 0.10492290556430817, "num_tokens": 10741310.0, "reward": 0.8029850721359253, "reward_std": 0.2710467576980591, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.5745089054107666, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7236486673355103, "step": 34 }, { "adv/mean_abs_final_conf": 0.5090737342834473, "adv/mean_abs_reasoning": 0.48734915256500244, "adv/mean_abs_step_conf": 0.7572380900382996, "adv/ratio_final_to_reasoning": 1.0445770380518866, "adv/ratio_step_to_reasoning": 1.5537896927753443, "adv/std_final_conf": 0.7659239172935486, "adv/std_reasoning": 0.7575402855873108, "adv/std_step_conf": 0.9266098141670227, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.501396462295518, "calib/avg_num_step_conf": 7.76171875, "calib/ece": 0.4034939759036146, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00010839207341373225, "calib/mean_conf": 0.989839357429719, "calib/mu_c": 0.9897945205479451, "calib/mu_w": 0.9899029126213589, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4034939759036146, "calib/std_conf": 0.001997563070754361, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8986625874125875, "calib/step_q_c_n": 1144.0, "calib/step_q_gap": 0.011473975312943585, "calib/step_q_w": 0.887188612099644, "calib/step_q_w_n": 843.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2988.0, "completions/max_terminated_length": 2988.0, "completions/mean_length": 904.2890625, "completions/mean_terminated_length": 922.3027954101562, "completions/min_length": 0.0, "completions/min_terminated_length": 496.0, "epoch": 0.037333333333333336, "grad_norm": 0.03180506452918053, "kl": 0.010568618774414062, "learning_rate": 4.583333333333333e-06, "loss": -0.1109, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01625356636941433, "mask/share_reasoning": 0.8752022385597229, "mask/share_step_conf": 0.08901292085647583, "num_tokens": 11082064.0, "reward": 0.7653804421424866, "reward_std": 0.23321697115898132, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.5783331990242004, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.6438338756561279, "step": 35 }, { "adv/mean_abs_final_conf": 0.38228654861450195, "adv/mean_abs_reasoning": 0.3659414052963257, "adv/mean_abs_step_conf": 0.7873794436454773, "adv/ratio_final_to_reasoning": 1.0446660123222202, "adv/ratio_step_to_reasoning": 2.151654424040611, "adv/std_final_conf": 0.6414028406143188, "adv/std_reasoning": 0.6404587030410767, "adv/std_step_conf": 0.9225838780403137, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.484375, "calib/ece": 0.24592000000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0003124999999997158, "calib/mean_conf": 0.98992, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9896875000000002, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24592000000000003, "calib/std_conf": 0.0019983993594874894, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9021782799748902, "calib/step_q_c_n": 1593.0, "calib/step_q_gap": -0.003296676847216906, "calib/step_q_w": 0.9054749568221071, "calib/step_q_w_n": 579.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2509.0, "completions/max_terminated_length": 2509.0, "completions/mean_length": 794.8125, "completions/mean_terminated_length": 807.4285888671875, "completions/min_length": 0.0, "completions/min_terminated_length": 388.0, "epoch": 0.0384, "grad_norm": 0.019437743350863457, "kl": 0.01312255859375, "learning_rate": 4.555555555555556e-06, "loss": -0.0454, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.01833474636077881, "mask/share_reasoning": 0.853883683681488, "mask/share_step_conf": 0.11215655505657196, "num_tokens": 11388248.0, "reward": 0.9233396053314209, "reward_std": 0.20475012063980103, "rewards/accuracy_reward_step": 0.7265625, "rewards/final_brier_reward_step": 0.7316156029701233, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7744386196136475, "step": 36 }, { "adv/mean_abs_final_conf": 0.42273592948913574, "adv/mean_abs_reasoning": 0.3929756283760071, "adv/mean_abs_step_conf": 0.7315104007720947, "adv/ratio_final_to_reasoning": 1.0757306534151105, "adv/ratio_step_to_reasoning": 1.8614650577571459, "adv/std_final_conf": 0.6996673941612244, "adv/std_reasoning": 0.6817983388900757, "adv/std_step_conf": 0.9243629574775696, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.83984375, "calib/ece": 0.517337552742616, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00016800000000016801, "calib/mean_conf": 0.9899113924050632, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.9898319999999999, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.517337552742616, "calib/std_conf": 0.0020325819049791453, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9004334038054967, "calib/step_q_c_n": 946.0, "calib/step_q_gap": 0.003611459993803412, "calib/step_q_w": 0.8968219438116933, "calib/step_q_w_n": 1317.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2960.0, "completions/max_terminated_length": 2960.0, "completions/mean_length": 845.92578125, "completions/mean_terminated_length": 894.8635864257812, "completions/min_length": 0.0, "completions/min_terminated_length": 517.0, "epoch": 0.039466666666666664, "grad_norm": 0.017507895827293396, "kl": 0.010918617248535156, "learning_rate": 4.527777777777778e-06, "loss": -0.1952, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.01621920056641102, "mask/share_reasoning": 0.8316308856010437, "mask/share_step_conf": 0.09746243804693222, "num_tokens": 11711901.0, "reward": 0.6471365094184875, "reward_std": 0.20595934987068176, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.44733160734176636, "rewards/format_reward_step": 0.92578125, "rewards/step_l2_reward": 0.5735039710998535, "step": 37 }, { "adv/mean_abs_final_conf": 0.4951311945915222, "adv/mean_abs_reasoning": 0.46970370411872864, "adv/mean_abs_step_conf": 0.7426836490631104, "adv/ratio_final_to_reasoning": 1.054135171278884, "adv/ratio_step_to_reasoning": 1.581174775822887, "adv/std_final_conf": 0.7573981285095215, "adv/std_reasoning": 0.7577217817306519, "adv/std_step_conf": 0.9252141714096069, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.953125, "calib/ece": 0.4544351464435147, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 3.3306690738754696e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4544351464435147, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9032015065913371, "calib/step_q_c_n": 1062.0, "calib/step_q_gap": 0.008704586673472625, "calib/step_q_w": 0.8944969199178645, "calib/step_q_w_n": 974.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2911.0, "completions/max_terminated_length": 2911.0, "completions/mean_length": 852.28515625, "completions/mean_terminated_length": 890.5509643554688, "completions/min_length": 0.0, "completions/min_terminated_length": 507.0, "epoch": 0.04053333333333333, "grad_norm": 0.023108594119548798, "kl": 0.011475563049316406, "learning_rate": 4.5e-06, "loss": -0.116, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.016291677951812744, "mask/share_reasoning": 0.8450714349746704, "mask/share_step_conf": 0.09566812217235565, "num_tokens": 12036974.0, "reward": 0.6965200901031494, "reward_std": 0.24555054306983948, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.5085784792900085, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.5977429747581482, "step": 38 }, { "adv/mean_abs_final_conf": 0.4412452280521393, "adv/mean_abs_reasoning": 0.42349773645401, "adv/mean_abs_step_conf": 0.745875895023346, "adv/ratio_final_to_reasoning": 1.0419069337813487, "adv/ratio_step_to_reasoning": 1.7612275835725624, "adv/std_final_conf": 0.6999874711036682, "adv/std_reasoning": 0.7014147639274597, "adv/std_step_conf": 0.9331080317497253, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.4903186768858411, "calib/avg_num_step_conf": 8.2265625, "calib/ece": 0.44319591836734695, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0001325803415352933, "calib/mean_conf": 0.990134693877551, "calib/mu_c": 0.9900746268656716, "calib/mu_w": 0.9902072072072069, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.44319591836734695, "calib/std_conf": 0.001073911602870015, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.902592429577465, "calib/step_q_c_n": 1136.0, "calib/step_q_gap": 0.008604800711485505, "calib/step_q_w": 0.8939876288659795, "calib/step_q_w_n": 970.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3003.0, "completions/max_terminated_length": 3003.0, "completions/mean_length": 890.3828125, "completions/mean_terminated_length": 904.5159301757812, "completions/min_length": 0.0, "completions/min_terminated_length": 472.0, "epoch": 0.0416, "grad_norm": 0.019972793757915497, "kl": 0.013368606567382812, "learning_rate": 4.472222222222223e-06, "loss": -0.1042, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.016698993742465973, "mask/share_reasoning": 0.8672369122505188, "mask/share_step_conf": 0.10043909400701523, "num_tokens": 12371000.0, "reward": 0.735222578048706, "reward_std": 0.21249210834503174, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.5318354368209839, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.6425158977508545, "step": 39 }, { "adv/mean_abs_final_conf": 0.43904614448547363, "adv/mean_abs_reasoning": 0.4147336483001709, "adv/mean_abs_step_conf": 0.7419914603233337, "adv/ratio_final_to_reasoning": 1.0586219523902871, "adv/ratio_step_to_reasoning": 1.7890794811669202, "adv/std_final_conf": 0.7193395495414734, "adv/std_reasoning": 0.7012978792190552, "adv/std_step_conf": 0.9300381541252136, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.49990397541770687, "calib/avg_num_step_conf": 8.4140625, "calib/ece": 0.48207600000000006, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 5.569425772788961e-06, "calib/mean_conf": 0.9900760000000001, "calib/mu_c": 0.9900787401574801, "calib/mu_w": 0.9900731707317073, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.48207600000000006, "calib/std_conf": 0.0008474809732377484, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9001939864209505, "calib/step_q_c_n": 1031.0, "calib/step_q_gap": -0.005035755342184034, "calib/step_q_w": 0.9052297417631345, "calib/step_q_w_n": 1123.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2676.0, "completions/max_terminated_length": 2676.0, "completions/mean_length": 884.18359375, "completions/mean_terminated_length": 898.2183227539062, "completions/min_length": 0.0, "completions/min_terminated_length": 478.0, "epoch": 0.042666666666666665, "grad_norm": 0.021310077980160713, "kl": 0.014337539672851562, "learning_rate": 4.444444444444444e-06, "loss": -0.0803, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.016493458300828934, "mask/share_reasoning": 0.8709720969200134, "mask/share_step_conf": 0.09690943360328674, "num_tokens": 12704111.0, "reward": 0.6947786808013916, "reward_std": 0.2072494924068451, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.5055359601974487, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.5894901156425476, "step": 40 }, { "adv/mean_abs_final_conf": 0.43155983090400696, "adv/mean_abs_reasoning": 0.4150729179382324, "adv/mean_abs_step_conf": 0.7384018898010254, "adv/ratio_final_to_reasoning": 1.0397205219932657, "adv/ratio_step_to_reasoning": 1.778969086850682, "adv/std_final_conf": 0.712813675403595, "adv/std_reasoning": 0.7012646794319153, "adv/std_step_conf": 0.9279786944389343, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4975369458128079, "calib/avg_num_step_conf": 8.4375, "calib/ece": 0.19380392156862758, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00014778325123177005, "calib/mean_conf": 0.9898823529411765, "calib/mu_c": 0.9898522167487684, "calib/mu_w": 0.9900000000000002, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19380392156862758, "calib/std_conf": 0.0018749855824128521, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9017281780902168, "calib/step_q_c_n": 1707.0, "calib/step_q_gap": -0.0025985327265601876, "calib/step_q_w": 0.904326710816777, "calib/step_q_w_n": 453.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1557.0, "completions/max_terminated_length": 1557.0, "completions/mean_length": 798.296875, "completions/mean_terminated_length": 801.427490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 445.0, "epoch": 0.04373333333333333, "grad_norm": 0.015647921711206436, "kl": 0.014879226684570312, "learning_rate": 4.416666666666667e-06, "loss": -0.0233, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.018516331911087036, "mask/share_reasoning": 0.8660091161727905, "mask/share_step_conf": 0.11156836152076721, "num_tokens": 13015723.0, "reward": 0.9741901159286499, "reward_std": 0.2122625857591629, "rewards/accuracy_reward_step": 0.79296875, "rewards/final_brier_reward_step": 0.7969257831573486, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7936419248580933, "step": 41 }, { "adv/mean_abs_final_conf": 0.37356019020080566, "adv/mean_abs_reasoning": 0.35552409291267395, "adv/mean_abs_step_conf": 0.7282037734985352, "adv/ratio_final_to_reasoning": 1.0507310127433807, "adv/ratio_step_to_reasoning": 2.0482543602956356, "adv/std_final_conf": 0.6794334650039673, "adv/std_reasoning": 0.6613250374794006, "adv/std_step_conf": 0.9256712198257446, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.49843260188087773, "calib/avg_num_step_conf": 8.3984375, "calib/ece": 0.3958151639344262, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -2.2633228840107655e-05, "calib/mean_conf": 0.9900774590163934, "calib/mu_c": 0.9900682758620689, "calib/mu_w": 0.990090909090909, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3958151639344262, "calib/std_conf": 0.0008530226502051216, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8990497147514263, "calib/step_q_c_n": 1227.0, "calib/step_q_gap": -0.005915615692777321, "calib/step_q_w": 0.9049653304442036, "calib/step_q_w_n": 923.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2788.0, "completions/max_terminated_length": 2788.0, "completions/mean_length": 804.41796875, "completions/mean_terminated_length": 827.0321044921875, "completions/min_length": 0.0, "completions/min_terminated_length": 476.0, "epoch": 0.0448, "grad_norm": 0.017200641334056854, "kl": 0.01670074462890625, "learning_rate": 4.388888888888889e-06, "loss": -0.094, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01786448247730732, "mask/share_reasoning": 0.847210168838501, "mask/share_step_conf": 0.10758156329393387, "num_tokens": 13326022.0, "reward": 0.7614730596542358, "reward_std": 0.188564732670784, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.5739758014678955, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.645064115524292, "step": 42 }, { "adv/mean_abs_final_conf": 0.48687943816185, "adv/mean_abs_reasoning": 0.4713999032974243, "adv/mean_abs_step_conf": 0.7664150595664978, "adv/ratio_final_to_reasoning": 1.0328373738648373, "adv/ratio_step_to_reasoning": 1.6258277827497498, "adv/std_final_conf": 0.7410199642181396, "adv/std_reasoning": 0.7394036054611206, "adv/std_step_conf": 0.930462121963501, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5029940119760479, "calib/avg_num_step_conf": 8.07421875, "calib/ece": 0.3246932270916336, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 4.7904191616821024e-05, "calib/mean_conf": 0.9900318725099603, "calib/mu_c": 0.9900479041916166, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3246932270916336, "calib/std_conf": 0.0010211801233396883, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8998834178131789, "calib/step_q_c_n": 1381.0, "calib/step_q_gap": -0.000626786268453694, "calib/step_q_w": 0.9005102040816326, "calib/step_q_w_n": 686.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2775.0, "completions/max_terminated_length": 2775.0, "completions/mean_length": 845.00390625, "completions/mean_terminated_length": 851.657470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 491.0, "epoch": 0.04586666666666667, "grad_norm": 0.019190490245819092, "kl": 0.016111373901367188, "learning_rate": 4.361111111111112e-06, "loss": -0.0029, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.017595965415239334, "mask/share_reasoning": 0.8708884716033936, "mask/share_step_conf": 0.10370305180549622, "num_tokens": 13647567.0, "reward": 0.8473519086837769, "reward_std": 0.25601252913475037, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.6588077545166016, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7093334197998047, "step": 43 }, { "adv/mean_abs_final_conf": 0.5013235211372375, "adv/mean_abs_reasoning": 0.4585108757019043, "adv/mean_abs_step_conf": 0.7387799024581909, "adv/ratio_final_to_reasoning": 1.0933732386822759, "adv/ratio_step_to_reasoning": 1.6112592778246342, "adv/std_final_conf": 0.7603207230567932, "adv/std_reasoning": 0.739250659942627, "adv/std_step_conf": 0.9254706501960754, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5048664122137405, "calib/avg_num_step_conf": 7.95703125, "calib/ece": 0.4678884462151396, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 9.732824427455267e-05, "calib/mean_conf": 0.9898007968127491, "calib/mu_c": 0.9898473282442747, "calib/mu_w": 0.9897500000000001, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4678884462151396, "calib/std_conf": 0.0013972651726494184, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9010283018867925, "calib/step_q_c_n": 1060.0, "calib/step_q_gap": 0.004262692879627772, "calib/step_q_w": 0.8967656090071647, "calib/step_q_w_n": 977.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2593.0, "completions/max_terminated_length": 2593.0, "completions/mean_length": 854.3046875, "completions/mean_terminated_length": 861.031494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 498.0, "epoch": 0.046933333333333334, "grad_norm": 0.024990690872073174, "kl": 0.018238067626953125, "learning_rate": 4.333333333333334e-06, "loss": -0.037, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01713503897190094, "mask/share_reasoning": 0.8744046688079834, "mask/share_step_conf": 0.10064785182476044, "num_tokens": 13972589.0, "reward": 0.7188483476638794, "reward_std": 0.22061403095722198, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5212242007255554, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.6180349588394165, "step": 44 }, { "adv/mean_abs_final_conf": 0.4545263648033142, "adv/mean_abs_reasoning": 0.4324095547199249, "adv/mean_abs_step_conf": 0.746199369430542, "adv/ratio_final_to_reasoning": 1.0511478292789218, "adv/ratio_step_to_reasoning": 1.7256773382675625, "adv/std_final_conf": 0.7177832722663879, "adv/std_reasoning": 0.7207047939300537, "adv/std_step_conf": 0.9280015230178833, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.4965034965034965, "calib/avg_num_step_conf": 9.17578125, "calib/ece": 0.40356557377049174, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9959016393442623, "calib/gap": -0.0006293706293705181, "calib/mean_conf": 0.9896311475409836, "calib/mu_c": 0.9893706293706293, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.40356557377049174, "calib/std_conf": 0.005749840795617992, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8993268450932683, "calib/step_q_c_n": 1233.0, "calib/step_q_gap": -0.011890000784868016, "calib/step_q_w": 0.9112168458781363, "calib/step_q_w_n": 1116.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2809.0, "completions/max_terminated_length": 2809.0, "completions/mean_length": 866.65234375, "completions/mean_terminated_length": 894.6088256835938, "completions/min_length": 0.0, "completions/min_terminated_length": 483.0, "epoch": 0.048, "grad_norm": 0.024536319077014923, "kl": 0.018016815185546875, "learning_rate": 4.305555555555556e-06, "loss": -0.1097, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.016593432053923607, "mask/share_reasoning": 0.849403977394104, "mask/share_step_conf": 0.10275260359048843, "num_tokens": 14299500.0, "reward": 0.7554113268852234, "reward_std": 0.21753597259521484, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5662726163864136, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.6429873704910278, "step": 45 }, { "adv/mean_abs_final_conf": 0.4345059394836426, "adv/mean_abs_reasoning": 0.4139566719532013, "adv/mean_abs_step_conf": 0.7318849563598633, "adv/ratio_final_to_reasoning": 1.0496411072044864, "adv/ratio_step_to_reasoning": 1.7680230950417064, "adv/std_final_conf": 0.7010535597801208, "adv/std_reasoning": 0.7015208005905151, "adv/std_step_conf": 0.9241446256637573, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.49860270144387525, "calib/avg_num_step_conf": 8.37109375, "calib/ece": 0.449479674796748, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -5.4028877503609074e-05, "calib/mean_conf": 0.990130081300813, "calib/mu_c": 0.9901052631578947, "calib/mu_w": 0.9901592920353983, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.449479674796748, "calib/std_conf": 0.0010356204659467386, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9008309608540925, "calib/step_q_c_n": 1124.0, "calib/step_q_gap": 0.004519871550854093, "calib/step_q_w": 0.8963110893032384, "calib/step_q_w_n": 1019.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2791.0, "completions/max_terminated_length": 2791.0, "completions/mean_length": 925.25, "completions/mean_terminated_length": 943.6812744140625, "completions/min_length": 0.0, "completions/min_terminated_length": 432.0, "epoch": 0.04906666666666667, "grad_norm": 0.018395164981484413, "kl": 0.017238616943359375, "learning_rate": 4.277777777777778e-06, "loss": -0.1028, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.016764050349593163, "mask/share_reasoning": 0.8601210713386536, "mask/share_step_conf": 0.10358359664678574, "num_tokens": 14641132.0, "reward": 0.7202885746955872, "reward_std": 0.2138737291097641, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.5280463695526123, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.6172181963920593, "step": 46 }, { "adv/mean_abs_final_conf": 0.43521004915237427, "adv/mean_abs_reasoning": 0.432475209236145, "adv/mean_abs_step_conf": 0.7342606782913208, "adv/ratio_final_to_reasoning": 1.0063236917581002, "adv/ratio_step_to_reasoning": 1.697809868889829, "adv/std_final_conf": 0.6987658739089966, "adv/std_reasoning": 0.7014555335044861, "adv/std_step_conf": 0.9218693375587463, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.49151234567901236, "calib/avg_num_step_conf": 7.94140625, "calib/ece": 0.397320987654321, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00029924242424239633, "calib/mean_conf": 0.9899135802469136, "calib/mu_c": 0.9897916666666666, "calib/mu_w": 0.990090909090909, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.397320987654321, "calib/std_conf": 0.002007378555059689, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8949606299212599, "calib/step_q_c_n": 1143.0, "calib/step_q_gap": 0.002522427674068961, "calib/step_q_w": 0.892438202247191, "calib/step_q_w_n": 890.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2475.0, "completions/max_terminated_length": 2475.0, "completions/mean_length": 868.7578125, "completions/mean_terminated_length": 896.7822265625, "completions/min_length": 0.0, "completions/min_terminated_length": 524.0, "epoch": 0.050133333333333335, "grad_norm": 0.023173121735453606, "kl": 0.019071578979492188, "learning_rate": 4.25e-06, "loss": -0.1861, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.016180016100406647, "mask/share_reasoning": 0.8588327169418335, "mask/share_step_conf": 0.09373725205659866, "num_tokens": 14969510.0, "reward": 0.7658748626708984, "reward_std": 0.20331431925296783, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.5700636506080627, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.6593424081802368, "step": 47 }, { "adv/mean_abs_final_conf": 0.6294046640396118, "adv/mean_abs_reasoning": 0.5752204656600952, "adv/mean_abs_step_conf": 0.7964880466461182, "adv/ratio_final_to_reasoning": 1.0941972715058694, "adv/ratio_step_to_reasoning": 1.3846656963641009, "adv/std_final_conf": 0.8177751302719116, "adv/std_reasoning": 0.7930737137794495, "adv/std_step_conf": 0.9295130968093872, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.5138781533272433, "calib/avg_num_step_conf": 8.65625, "calib/ece": 0.43153749999999996, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.9958333333333333, "calib/gap": 0.009281357599606466, "calib/mean_conf": 0.9857041666666667, "calib/mu_c": 0.9898421052631579, "calib/mu_w": 0.9805607476635514, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.43153749999999996, "calib/std_conf": 0.06379824957869577, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8976549053356283, "calib/step_q_c_n": 1162.0, "calib/step_q_gap": 0.0032526282957801023, "calib/step_q_w": 0.8944022770398482, "calib/step_q_w_n": 1054.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2969.0, "completions/max_terminated_length": 2969.0, "completions/mean_length": 828.91796875, "completions/mean_terminated_length": 859.1214599609375, "completions/min_length": 0.0, "completions/min_terminated_length": 475.0, "epoch": 0.0512, "grad_norm": 0.02995246835052967, "kl": 0.020765304565429688, "learning_rate": 4.222222222222223e-06, "loss": -0.1281, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.017236867919564247, "mask/share_reasoning": 0.8404216766357422, "mask/share_step_conf": 0.1071852594614029, "num_tokens": 15285401.0, "reward": 0.727776288986206, "reward_std": 0.30010148882865906, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.531619131565094, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.6333083510398865, "step": 48 }, { "adv/mean_abs_final_conf": 0.44170159101486206, "adv/mean_abs_reasoning": 0.4147529602050781, "adv/mean_abs_step_conf": 0.7751125693321228, "adv/ratio_final_to_reasoning": 1.0649751379627501, "adv/ratio_step_to_reasoning": 1.868853615773741, "adv/std_final_conf": 0.6997136473655701, "adv/std_reasoning": 0.6818037033081055, "adv/std_step_conf": 0.925434947013855, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.4942528735632184, "calib/avg_num_step_conf": 8.40234375, "calib/ece": 0.3465983606557377, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00011494252873545108, "calib/mean_conf": 0.9900409836065573, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9901149425287353, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3465983606557377, "calib/std_conf": 0.0006388711995131111, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8977125658389766, "calib/step_q_c_n": 1329.0, "calib/step_q_gap": 0.006617675328027706, "calib/step_q_w": 0.8910948905109489, "calib/step_q_w_n": 822.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3055.0, "completions/max_terminated_length": 3055.0, "completions/mean_length": 813.41796875, "completions/mean_terminated_length": 843.0567016601562, "completions/min_length": 0.0, "completions/min_terminated_length": 466.0, "epoch": 0.05226666666666667, "grad_norm": 0.025971531867980957, "kl": 0.018663406372070312, "learning_rate": 4.194444444444445e-06, "loss": -0.1239, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01712140068411827, "mask/share_reasoning": 0.845194935798645, "mask/share_step_conf": 0.1025274246931076, "num_tokens": 15598172.0, "reward": 0.7944972515106201, "reward_std": 0.21691881120204926, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6159992218017578, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.6604952216148376, "step": 49 }, { "adv/mean_abs_final_conf": 0.46414607763290405, "adv/mean_abs_reasoning": 0.43221813440322876, "adv/mean_abs_step_conf": 0.7735408544540405, "adv/ratio_final_to_reasoning": 1.0738699760336496, "adv/ratio_step_to_reasoning": 1.7897001372283514, "adv/std_final_conf": 0.721068799495697, "adv/std_reasoning": 0.7014055252075195, "adv/std_step_conf": 0.9300537705421448, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5085791264889393, "calib/avg_num_step_conf": 8.359375, "calib/ece": 0.33388000000000007, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00017158252977889088, "calib/mean_conf": 0.9898800000000001, "calib/mu_c": 0.9899390243902437, "calib/mu_w": 0.9897674418604648, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33388000000000007, "calib/std_conf": 0.001088852607105297, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8999010600706715, "calib/step_q_c_n": 1415.0, "calib/step_q_gap": 0.004038991105154266, "calib/step_q_w": 0.8958620689655172, "calib/step_q_w_n": 725.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2104.0, "completions/max_terminated_length": 2104.0, "completions/mean_length": 836.765625, "completions/mean_terminated_length": 846.6878051757812, "completions/min_length": 0.0, "completions/min_terminated_length": 458.0, "epoch": 0.05333333333333334, "grad_norm": 0.03273485600948334, "kl": 0.02405548095703125, "learning_rate": 4.166666666666667e-06, "loss": -0.064, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.017580296844244003, "mask/share_reasoning": 0.8621413707733154, "mask/share_step_conf": 0.10855959355831146, "num_tokens": 15917744.0, "reward": 0.82176673412323, "reward_std": 0.22579185664653778, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6473987698554993, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.6726971864700317, "step": 50 }, { "adv/mean_abs_final_conf": 0.43641749024391174, "adv/mean_abs_reasoning": 0.40805771946907043, "adv/mean_abs_step_conf": 0.7661856412887573, "adv/ratio_final_to_reasoning": 1.0694994100632151, "adv/ratio_step_to_reasoning": 1.8776403551087137, "adv/std_final_conf": 0.6907274127006531, "adv/std_reasoning": 0.6817567348480225, "adv/std_step_conf": 0.925344705581665, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.4966442953020134, "calib/avg_num_step_conf": 8.12890625, "calib/ece": 0.37670781893004124, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0002013422818791799, "calib/mean_conf": 0.9898765432098766, "calib/mu_c": 0.9897986577181206, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.37670781893004124, "calib/std_conf": 0.0019205369365560565, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8977551020408163, "calib/step_q_c_n": 1274.0, "calib/step_q_gap": 0.006379637356801582, "calib/step_q_w": 0.8913754646840147, "calib/step_q_w_n": 807.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3061.0, "completions/max_terminated_length": 3061.0, "completions/mean_length": 784.70703125, "completions/mean_terminated_length": 810.0201416015625, "completions/min_length": 0.0, "completions/min_terminated_length": 71.0, "epoch": 0.0544, "grad_norm": 0.028437655419111252, "kl": 0.023008346557617188, "learning_rate": 4.138888888888889e-06, "loss": -0.1319, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.017778389155864716, "mask/share_reasoning": 0.8434497714042664, "mask/share_step_conf": 0.10752184689044952, "num_tokens": 16227925.0, "reward": 0.7702333927154541, "reward_std": 0.208187073469162, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.5891964435577393, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.6458014249801636, "step": 51 }, { "adv/mean_abs_final_conf": 0.4756985306739807, "adv/mean_abs_reasoning": 0.41691356897354126, "adv/mean_abs_step_conf": 0.7791311740875244, "adv/ratio_final_to_reasoning": 1.1410003561293782, "adv/ratio_step_to_reasoning": 1.8688074269345039, "adv/std_final_conf": 0.7390729188919067, "adv/std_reasoning": 0.7014615535736084, "adv/std_step_conf": 0.9296015501022339, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5012301587301587, "calib/avg_num_step_conf": 7.96484375, "calib/ece": 0.28135222672064764, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 9.920634920335125e-06, "calib/mean_conf": 0.9898542510121456, "calib/mu_c": 0.9898571428571427, "calib/mu_w": 0.9898472222222223, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28135222672064764, "calib/std_conf": 0.0015595507455950335, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8934075967859751, "calib/step_q_c_n": 1369.0, "calib/step_q_gap": -0.007855089781189095, "calib/step_q_w": 0.9012626865671642, "calib/step_q_w_n": 670.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2799.0, "completions/max_terminated_length": 2799.0, "completions/mean_length": 875.765625, "completions/mean_terminated_length": 896.7840576171875, "completions/min_length": 0.0, "completions/min_terminated_length": 530.0, "epoch": 0.055466666666666664, "grad_norm": 0.03539959341287613, "kl": 0.02320098876953125, "learning_rate": 4.111111111111111e-06, "loss": -0.0999, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.01645956188440323, "mask/share_reasoning": 0.8659517168998718, "mask/share_step_conf": 0.09415123611688614, "num_tokens": 16560073.0, "reward": 0.8869292736053467, "reward_std": 0.22455470263957977, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.6892030239105225, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.754967987537384, "step": 52 }, { "adv/mean_abs_final_conf": 0.5189765691757202, "adv/mean_abs_reasoning": 0.49131283164024353, "adv/mean_abs_step_conf": 0.7682706713676453, "adv/ratio_final_to_reasoning": 1.056305750133009, "adv/ratio_step_to_reasoning": 1.5637097626837477, "adv/std_final_conf": 0.7610706686973572, "adv/std_reasoning": 0.7577739357948303, "adv/std_step_conf": 0.9258795976638794, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.4946581196581197, "calib/avg_num_step_conf": 8.875, "calib/ece": 0.3068695121951218, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00010860805860757328, "calib/mean_conf": 0.9897963414634146, "calib/mu_c": 0.989761904761905, "calib/mu_w": 0.9898705128205125, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3068695121951218, "calib/std_conf": 0.001672113101696819, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.890580693815988, "calib/step_q_c_n": 1326.0, "calib/step_q_gap": -0.01226645206139032, "calib/step_q_w": 0.9028471458773784, "calib/step_q_w_n": 946.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2791.0, "completions/max_terminated_length": 2791.0, "completions/mean_length": 825.0234375, "completions/mean_terminated_length": 855.0850219726562, "completions/min_length": 0.0, "completions/min_terminated_length": 424.0, "epoch": 0.05653333333333333, "grad_norm": 0.031232628971338272, "kl": 0.039867401123046875, "learning_rate": 4.083333333333334e-06, "loss": -0.1512, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01677112840116024, "mask/share_reasoning": 0.8481372594833374, "mask/share_step_conf": 0.09993541985750198, "num_tokens": 16877103.0, "reward": 0.8562500476837158, "reward_std": 0.2774738669395447, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6623198986053467, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7267426252365112, "step": 53 }, { "adv/mean_abs_final_conf": 0.34591519832611084, "adv/mean_abs_reasoning": 0.3050154447555542, "adv/mean_abs_step_conf": 0.6924338936805725, "adv/ratio_final_to_reasoning": 1.1340907625295322, "adv/ratio_step_to_reasoning": 2.2701601036482058, "adv/std_final_conf": 0.6442092657089233, "adv/std_reasoning": 0.6185151934623718, "adv/std_step_conf": 0.9229856729507446, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.49490647229296564, "calib/avg_num_step_conf": 8.42578125, "calib/ece": 0.22404761904761905, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00010362694300536379, "calib/mean_conf": 0.989920634920635, "calib/mu_c": 0.9898963730569948, "calib/mu_w": 0.9900000000000002, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22404761904761905, "calib/std_conf": 0.0012573793268059424, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.893818407960199, "calib/step_q_c_n": 1608.0, "calib/step_q_gap": 0.00039399994562694385, "calib/step_q_w": 0.893424408014572, "calib/step_q_w_n": 549.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2961.0, "completions/max_terminated_length": 2961.0, "completions/mean_length": 804.60546875, "completions/mean_terminated_length": 810.94091796875, "completions/min_length": 0.0, "completions/min_terminated_length": 416.0, "epoch": 0.0576, "grad_norm": 0.02178231254220009, "kl": 0.02564239501953125, "learning_rate": 4.055555555555556e-06, "loss": 0.0001, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.018358973786234856, "mask/share_reasoning": 0.8616392612457275, "mask/share_step_conf": 0.11218926310539246, "num_tokens": 17189314.0, "reward": 0.9394882321357727, "reward_std": 0.15851876139640808, "rewards/accuracy_reward_step": 0.75390625, "rewards/final_brier_reward_step": 0.7584140300750732, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7729063034057617, "step": 54 }, { "adv/mean_abs_final_conf": 0.5040771961212158, "adv/mean_abs_reasoning": 0.3714796006679535, "adv/mean_abs_step_conf": 0.7354352474212646, "adv/ratio_final_to_reasoning": 1.3569444868973695, "adv/ratio_step_to_reasoning": 1.979745983625713, "adv/std_final_conf": 0.7724276185035706, "adv/std_reasoning": 0.681528627872467, "adv/std_step_conf": 0.9326515793800354, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5210697977821266, "calib/avg_num_step_conf": 8.36328125, "calib/ece": 0.40023904382470127, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9840637450199203, "calib/gap": 0.017532289628180098, "calib/mean_conf": 0.9819123505976095, "calib/mu_c": 0.9892465753424659, "calib/mu_w": 0.9717142857142858, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40023904382470127, "calib/std_conf": 0.06495116374757216, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.893628318584071, "calib/step_q_c_n": 1243.0, "calib/step_q_gap": 0.009686225042868313, "calib/step_q_w": 0.8839420935412027, "calib/step_q_w_n": 898.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1995.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 783.578125, "completions/mean_terminated_length": 799.187255859375, "completions/min_length": 0.0, "completions/min_terminated_length": 477.0, "epoch": 0.058666666666666666, "grad_norm": 0.033670056611299515, "kl": 0.026943206787109375, "learning_rate": 4.027777777777779e-06, "loss": -0.1008, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.018136566504836082, "mask/share_reasoning": 0.8516228795051575, "mask/share_step_conf": 0.1107092797756195, "num_tokens": 17497734.0, "reward": 0.7946262955665588, "reward_std": 0.20714420080184937, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.5890586376190186, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.6900376081466675, "step": 55 }, { "adv/mean_abs_final_conf": 0.5695196390151978, "adv/mean_abs_reasoning": 0.4986175298690796, "adv/mean_abs_step_conf": 0.741671085357666, "adv/ratio_final_to_reasoning": 1.1421973855687237, "adv/ratio_step_to_reasoning": 1.4874548946411976, "adv/std_final_conf": 0.7978363037109375, "adv/std_reasoning": 0.7577388882637024, "adv/std_step_conf": 0.9346494674682617, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.504842875591907, "calib/avg_num_step_conf": 8.1328125, "calib/ece": 0.4117154811715481, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.99581589958159, "calib/gap": 0.0008788922370498442, "calib/mean_conf": 0.9891213389121338, "calib/mu_c": 0.9894927536231884, "calib/mu_w": 0.9886138613861386, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4117154811715481, "calib/std_conf": 0.006243148938441668, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8859311562224185, "calib/step_q_c_n": 1133.0, "calib/step_q_gap": -0.003584123018888108, "calib/step_q_w": 0.8895152792413066, "calib/step_q_w_n": 949.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2915.0, "completions/max_terminated_length": 2915.0, "completions/mean_length": 831.46875, "completions/mean_terminated_length": 861.7651977539062, "completions/min_length": 0.0, "completions/min_terminated_length": 459.0, "epoch": 0.05973333333333333, "grad_norm": 0.019158584997057915, "kl": 0.02632904052734375, "learning_rate": 4.000000000000001e-06, "loss": -0.1299, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.01669345796108246, "mask/share_reasoning": 0.8484475612640381, "mask/share_step_conf": 0.09970273077487946, "num_tokens": 17817430.0, "reward": 0.7434085011482239, "reward_std": 0.2679101228713989, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.543994128704071, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.6490728259086609, "step": 56 }, { "adv/mean_abs_final_conf": 0.5240992307662964, "adv/mean_abs_reasoning": 0.4981672465801239, "adv/mean_abs_step_conf": 0.73002690076828, "adv/ratio_final_to_reasoning": 1.0520547755080114, "adv/ratio_step_to_reasoning": 1.4654253280998562, "adv/std_final_conf": 0.7817376852035522, "adv/std_reasoning": 0.7754046320915222, "adv/std_step_conf": 0.9265749454498291, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5551038843721771, "calib/avg_num_step_conf": 8.35546875, "calib/ece": 0.3196285714285714, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9959183673469387, "calib/gap": 0.0020347034025894883, "calib/mean_conf": 0.9890163265306122, "calib/mu_c": 0.9896890243902438, "calib/mu_w": 0.9876543209876543, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3196285714285714, "calib/std_conf": 0.006306438140172847, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.894915191740413, "calib/step_q_c_n": 1356.0, "calib/step_q_gap": 0.016677643847692547, "calib/step_q_w": 0.8782375478927205, "calib/step_q_w_n": 783.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2789.0, "completions/max_terminated_length": 2789.0, "completions/mean_length": 792.07421875, "completions/mean_terminated_length": 824.2723388671875, "completions/min_length": 0.0, "completions/min_terminated_length": 519.0, "epoch": 0.0608, "grad_norm": 0.02918510138988495, "kl": 0.024547576904296875, "learning_rate": 3.972222222222223e-06, "loss": -0.1605, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01758004166185856, "mask/share_reasoning": 0.8421605229377747, "mask/share_step_conf": 0.10119695961475372, "num_tokens": 18126993.0, "reward": 0.83964604139328, "reward_std": 0.261970579624176, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6482839584350586, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7114768028259277, "step": 57 }, { "adv/mean_abs_final_conf": 0.5979112386703491, "adv/mean_abs_reasoning": 0.5257660746574402, "adv/mean_abs_step_conf": 0.7536758184432983, "adv/ratio_final_to_reasoning": 1.1372191312646307, "adv/ratio_step_to_reasoning": 1.4334812662348961, "adv/std_final_conf": 0.8220943808555603, "adv/std_reasoning": 0.7928545475006104, "adv/std_step_conf": 0.9348974823951721, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.45082572502685286, "calib/avg_num_step_conf": 8.4375, "calib/ece": 0.44456734693877564, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9918367346938776, "calib/gap": -0.0006325187969927759, "calib/mean_conf": 0.9874244897959185, "calib/mu_c": 0.9871353383458645, "calib/mu_w": 0.9877678571428573, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.44456734693877564, "calib/std_conf": 0.009137465611476537, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8876113657195234, "calib/step_q_c_n": 1091.0, "calib/step_q_gap": 0.007789101921581443, "calib/step_q_w": 0.8798222637979419, "calib/step_q_w_n": 1069.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2950.0, "completions/max_terminated_length": 2950.0, "completions/mean_length": 906.1640625, "completions/mean_terminated_length": 924.2151489257812, "completions/min_length": 0.0, "completions/min_terminated_length": 486.0, "epoch": 0.06186666666666667, "grad_norm": 0.032789744436740875, "kl": 0.025289535522460938, "learning_rate": 3.944444444444445e-06, "loss": -0.072, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01625051349401474, "mask/share_reasoning": 0.8657860159873962, "mask/share_step_conf": 0.09843222051858902, "num_tokens": 18465291.0, "reward": 0.723969578742981, "reward_std": 0.2588580250740051, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.5299254059791565, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.6234825849533081, "step": 58 }, { "adv/mean_abs_final_conf": 0.6285378932952881, "adv/mean_abs_reasoning": 0.5240025520324707, "adv/mean_abs_step_conf": 0.7756651639938354, "adv/ratio_final_to_reasoning": 1.199493954480473, "adv/ratio_step_to_reasoning": 1.4802698211778367, "adv/std_final_conf": 0.7962230443954468, "adv/std_reasoning": 0.7577937245368958, "adv/std_step_conf": 0.9284113049507141, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.49198454161597366, "calib/avg_num_step_conf": 8.18359375, "calib/ece": 0.35284552845528466, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.991869918699187, "calib/gap": -0.00509697273312848, "calib/mean_conf": 0.9829268292682928, "calib/mu_c": 0.9810828025477705, "calib/mu_w": 0.986179775280899, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34878048780487814, "calib/std_conf": 0.06326138455744801, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8827112135176651, "calib/step_q_c_n": 1302.0, "calib/step_q_gap": -0.002559908802637456, "calib/step_q_w": 0.8852711223203026, "calib/step_q_w_n": 793.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 3015.0, "completions/max_terminated_length": 3015.0, "completions/mean_length": 826.66796875, "completions/mean_terminated_length": 846.508056640625, "completions/min_length": 0.0, "completions/min_terminated_length": 367.0, "epoch": 0.06293333333333333, "grad_norm": 0.03274433687329292, "kl": 0.02954864501953125, "learning_rate": 3.916666666666667e-06, "loss": -0.142, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.017488282173871994, "mask/share_reasoning": 0.8532015085220337, "mask/share_step_conf": 0.10587271302938461, "num_tokens": 18783166.0, "reward": 0.8164454698562622, "reward_std": 0.2749021649360657, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6187648177146912, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.6992822289466858, "step": 59 }, { "adv/mean_abs_final_conf": 0.6482319831848145, "adv/mean_abs_reasoning": 0.5000711679458618, "adv/mean_abs_step_conf": 0.7497437000274658, "adv/ratio_final_to_reasoning": 1.2962794592768696, "adv/ratio_step_to_reasoning": 1.4992739995532671, "adv/std_final_conf": 0.8127201795578003, "adv/std_reasoning": 0.7576885223388672, "adv/std_step_conf": 0.9321392178535461, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.44521059782608696, "calib/avg_num_step_conf": 8.8984375, "calib/ece": 0.45851851851851855, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9876543209876543, "calib/gap": -0.0008953804347823224, "calib/mean_conf": 0.9852674897119343, "calib/mu_c": 0.9848437500000002, "calib/mu_w": 0.9857391304347826, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.45851851851851855, "calib/std_conf": 0.012998514186810313, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8873805147058824, "calib/step_q_c_n": 1088.0, "calib/step_q_gap": -0.004892594537815098, "calib/step_q_w": 0.8922731092436975, "calib/step_q_w_n": 1190.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2874.0, "completions/max_terminated_length": 2874.0, "completions/mean_length": 825.25390625, "completions/mean_terminated_length": 855.3239135742188, "completions/min_length": 0.0, "completions/min_terminated_length": 439.0, "epoch": 0.064, "grad_norm": 0.037229057401418686, "kl": 0.03716278076171875, "learning_rate": 3.88888888888889e-06, "loss": -0.1014, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.01703135296702385, "mask/share_reasoning": 0.8419076204299927, "mask/share_step_conf": 0.10590481013059616, "num_tokens": 19103287.0, "reward": 0.7158203125, "reward_std": 0.2458280324935913, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.5124461054801941, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.6293507218360901, "step": 60 }, { "adv/mean_abs_final_conf": 0.6403580904006958, "adv/mean_abs_reasoning": 0.41955268383026123, "adv/mean_abs_step_conf": 0.7369972467422485, "adv/ratio_final_to_reasoning": 1.5262876751368035, "adv/ratio_step_to_reasoning": 1.7566262239438233, "adv/std_final_conf": 0.7925984859466553, "adv/std_reasoning": 0.7014154195785522, "adv/std_step_conf": 0.9274880290031433, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4565770348837209, "calib/avg_num_step_conf": 8.46484375, "calib/ece": 0.30206349206349203, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9920634920634921, "calib/gap": 0.0010668604651163927, "calib/mean_conf": 0.9846031746031746, "calib/mu_c": 0.9849418604651164, "calib/mu_w": 0.983875, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.30206349206349203, "calib/std_conf": 0.009098298606934282, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8922908093278464, "calib/step_q_c_n": 1458.0, "calib/step_q_gap": 0.014759074490046653, "calib/step_q_w": 0.8775317348377998, "calib/step_q_w_n": 709.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2080.0, "completions/max_terminated_length": 2080.0, "completions/mean_length": 786.16015625, "completions/mean_terminated_length": 795.4822387695312, "completions/min_length": 0.0, "completions/min_terminated_length": 380.0, "epoch": 0.06506666666666666, "grad_norm": 0.025057543069124222, "kl": 0.03397369384765625, "learning_rate": 3.861111111111112e-06, "loss": -0.0575, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.01846371404826641, "mask/share_reasoning": 0.8561471700668335, "mask/share_step_conf": 0.11367038637399673, "num_tokens": 19408608.0, "reward": 0.8597561120986938, "reward_std": 0.2305433750152588, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.6777324080467224, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7113110423088074, "step": 61 }, { "adv/mean_abs_final_conf": 0.7482225298881531, "adv/mean_abs_reasoning": 0.5922585129737854, "adv/mean_abs_step_conf": 0.7756792306900024, "adv/ratio_final_to_reasoning": 1.2633377376565813, "adv/ratio_step_to_reasoning": 1.3096970557590544, "adv/std_final_conf": 0.8975844979286194, "adv/std_reasoning": 0.826655924320221, "adv/std_step_conf": 0.934055745601654, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.4916534181240063, "calib/avg_num_step_conf": 8.140625, "calib/ece": 0.43056680161943334, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9878542510121457, "calib/gap": 0.00998608903020648, "calib/mean_conf": 0.9811740890688261, "calib/mu_c": 0.9856617647058824, "calib/mu_w": 0.9756756756756759, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.43056680161943334, "calib/std_conf": 0.06321864616836151, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8915182755388941, "calib/step_q_c_n": 1067.0, "calib/step_q_gap": 0.009945020868294296, "calib/step_q_w": 0.8815732546705998, "calib/step_q_w_n": 1017.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1956.0, "completions/max_terminated_length": 1956.0, "completions/mean_length": 803.88671875, "completions/mean_terminated_length": 816.6468505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 481.0, "epoch": 0.06613333333333334, "grad_norm": 0.03266182541847229, "kl": 0.036937713623046875, "learning_rate": 3.833333333333334e-06, "loss": -0.0886, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.0176240187138319, "mask/share_reasoning": 0.8623931407928467, "mask/share_step_conf": 0.10435787588357925, "num_tokens": 19721483.0, "reward": 0.735969603061676, "reward_std": 0.31463518738746643, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.5481457114219666, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.6245747804641724, "step": 62 }, { "adv/mean_abs_final_conf": 0.6453641653060913, "adv/mean_abs_reasoning": 0.4795626997947693, "adv/mean_abs_step_conf": 0.735736608505249, "adv/ratio_final_to_reasoning": 1.3457346986791872, "adv/ratio_step_to_reasoning": 1.5341823057133308, "adv/std_final_conf": 0.8454790711402893, "adv/std_reasoning": 0.7753329873085022, "adv/std_step_conf": 0.9310802221298218, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.4635060934779697, "calib/avg_num_step_conf": 8.34765625, "calib/ece": 0.44995918367346965, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9959183673469387, "calib/gap": -0.001305075666264921, "calib/mean_conf": 0.98465306122449, "calib/mu_c": 0.9840458015267177, "calib/mu_w": 0.9853508771929826, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.44995918367346965, "calib/std_conf": 0.007366640251029523, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8937358490566037, "calib/step_q_c_n": 1060.0, "calib/step_q_gap": -0.0007488306091345542, "calib/step_q_w": 0.8944846796657383, "calib/step_q_w_n": 1077.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2105.0, "completions/max_terminated_length": 2105.0, "completions/mean_length": 838.203125, "completions/mean_terminated_length": 858.320068359375, "completions/min_length": 0.0, "completions/min_terminated_length": 439.0, "epoch": 0.0672, "grad_norm": 0.02464851178228855, "kl": 0.03795623779296875, "learning_rate": 3.8055555555555556e-06, "loss": -0.1238, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01703377068042755, "mask/share_reasoning": 0.8587255477905273, "mask/share_step_conf": 0.10080315172672272, "num_tokens": 20044703.0, "reward": 0.7243086695671082, "reward_std": 0.24416929483413696, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5244882702827454, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.6303789615631104, "step": 63 }, { "adv/mean_abs_final_conf": 0.5989560484886169, "adv/mean_abs_reasoning": 0.3883668780326843, "adv/mean_abs_step_conf": 0.7467473745346069, "adv/ratio_final_to_reasoning": 1.5422428697387778, "adv/ratio_step_to_reasoning": 1.9227885197556984, "adv/std_final_conf": 0.7767327427864075, "adv/std_reasoning": 0.6816226243972778, "adv/std_step_conf": 0.9268690347671509, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.44446606475716055, "calib/avg_num_step_conf": 8.51953125, "calib/ece": 0.27610441767068294, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9879518072289156, "calib/gap": 0.0006592465753426646, "calib/mean_conf": 0.9829317269076308, "calib/mu_c": 0.9831250000000001, "calib/mu_w": 0.9824657534246575, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27610441767068294, "calib/std_conf": 0.010596627363252014, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8877248322147652, "calib/step_q_c_n": 1490.0, "calib/step_q_gap": 0.001545382142406182, "calib/step_q_w": 0.886179450072359, "calib/step_q_w_n": 691.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2988.0, "completions/max_terminated_length": 2988.0, "completions/mean_length": 829.1953125, "completions/mean_terminated_length": 835.7244262695312, "completions/min_length": 0.0, "completions/min_terminated_length": 476.0, "epoch": 0.06826666666666667, "grad_norm": 0.04055461660027504, "kl": 0.04143524169921875, "learning_rate": 3.777777777777778e-06, "loss": -0.0142, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.017893537878990173, "mask/share_reasoning": 0.8629002571105957, "mask/share_step_conf": 0.11139366030693054, "num_tokens": 20360753.0, "reward": 0.8846273422241211, "reward_std": 0.19556494057178497, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.6971074342727661, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7401160001754761, "step": 64 }, { "adv/mean_abs_final_conf": 0.6421319246292114, "adv/mean_abs_reasoning": 0.34305283427238464, "adv/mean_abs_step_conf": 0.7524635791778564, "adv/ratio_final_to_reasoning": 1.8718164098284562, "adv/ratio_step_to_reasoning": 2.193433500626317, "adv/std_final_conf": 0.8032649755477905, "adv/std_reasoning": 0.6401863694190979, "adv/std_step_conf": 0.9258484840393066, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5481290322580645, "calib/avg_num_step_conf": 8.38671875, "calib/ece": 0.3761960784313728, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.996078431372549, "calib/gap": 0.0017096774193545716, "calib/mean_conf": 0.9840392156862747, "calib/mu_c": 0.9847096774193548, "calib/mu_w": 0.9830000000000002, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3761960784313728, "calib/std_conf": 0.007232716475857886, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8924881516587677, "calib/step_q_c_n": 1266.0, "calib/step_q_gap": 0.0016708985373147245, "calib/step_q_w": 0.890817253121453, "calib/step_q_w_n": 881.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1474.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 737.390625, "completions/mean_terminated_length": 740.2824096679688, "completions/min_length": 0.0, "completions/min_terminated_length": 381.0, "epoch": 0.06933333333333333, "grad_norm": 0.04465280845761299, "kl": 0.046863555908203125, "learning_rate": 3.7500000000000005e-06, "loss": -0.0059, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.019956696778535843, "mask/share_reasoning": 0.855435311794281, "mask/share_step_conf": 0.12070174515247345, "num_tokens": 20654549.0, "reward": 0.8108848929405212, "reward_std": 0.18426105380058289, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6184441447257996, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.6830130219459534, "step": 65 }, { "adv/mean_abs_final_conf": 0.683743417263031, "adv/mean_abs_reasoning": 0.46175944805145264, "adv/mean_abs_step_conf": 0.7483415603637695, "adv/ratio_final_to_reasoning": 1.4807350886880897, "adv/ratio_step_to_reasoning": 1.6206307494554693, "adv/std_final_conf": 0.8428446054458618, "adv/std_reasoning": 0.7392820715904236, "adv/std_step_conf": 0.9290730953216553, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.5531841414560021, "calib/avg_num_step_conf": 8.11328125, "calib/ece": 0.45605809128630714, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.995850622406639, "calib/gap": 0.0014200856471887624, "calib/mean_conf": 0.9830290456431536, "calib/mu_c": 0.983700787401575, "calib/mu_w": 0.9822807017543862, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.45605809128630714, "calib/std_conf": 0.008420681091429592, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8887060998151571, "calib/step_q_c_n": 1082.0, "calib/step_q_gap": 0.021178461624202338, "calib/step_q_w": 0.8675276381909548, "calib/step_q_w_n": 995.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2664.0, "completions/max_terminated_length": 2664.0, "completions/mean_length": 886.8671875, "completions/mean_terminated_length": 904.5338745117188, "completions/min_length": 0.0, "completions/min_terminated_length": 447.0, "epoch": 0.0704, "grad_norm": 0.028026781976222992, "kl": 0.038791656494140625, "learning_rate": 3.7222222222222225e-06, "loss": -0.0837, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.016637243330478668, "mask/share_reasoning": 0.8638635873794556, "mask/share_step_conf": 0.09996792674064636, "num_tokens": 20987939.0, "reward": 0.7157400250434875, "reward_std": 0.21129858493804932, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.5115370750427246, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.6324428915977478, "step": 66 }, { "adv/mean_abs_final_conf": 0.5312223434448242, "adv/mean_abs_reasoning": 0.30277425050735474, "adv/mean_abs_step_conf": 0.7451149821281433, "adv/ratio_final_to_reasoning": 1.7545162528010956, "adv/ratio_step_to_reasoning": 2.460958885636953, "adv/std_final_conf": 0.7096737623214722, "adv/std_reasoning": 0.5961373448371887, "adv/std_step_conf": 0.9236457347869873, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5587776517300056, "calib/avg_num_step_conf": 8.58984375, "calib/ece": 0.3273880000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.992, "calib/gap": 0.0028957742484401194, "calib/mean_conf": 0.9833880000000002, "calib/mu_c": 0.9843841463414635, "calib/mu_w": 0.9814883720930234, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3273880000000001, "calib/std_conf": 0.00901451363080671, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8887572953736657, "calib/step_q_c_n": 1405.0, "calib/step_q_gap": 0.0033353810159830743, "calib/step_q_w": 0.8854219143576826, "calib/step_q_w_n": 794.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2551.0, "completions/max_terminated_length": 2551.0, "completions/mean_length": 846.890625, "completions/mean_terminated_length": 860.3333740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 491.0, "epoch": 0.07146666666666666, "grad_norm": 0.05231475085020065, "kl": 0.04485321044921875, "learning_rate": 3.694444444444445e-06, "loss": -0.0351, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.017291639000177383, "mask/share_reasoning": 0.8610464930534363, "mask/share_step_conf": 0.10603685677051544, "num_tokens": 21309751.0, "reward": 0.8452394008636475, "reward_std": 0.17514097690582275, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6527136564254761, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7143276333808899, "step": 67 }, { "adv/mean_abs_final_conf": 0.6500144004821777, "adv/mean_abs_reasoning": 0.41077685356140137, "adv/mean_abs_step_conf": 0.763282835483551, "adv/ratio_final_to_reasoning": 1.5824026958836814, "adv/ratio_step_to_reasoning": 1.858144705248, "adv/std_final_conf": 0.8139645457267761, "adv/std_reasoning": 0.6816565990447998, "adv/std_step_conf": 0.9272443652153015, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5556972789115646, "calib/avg_num_step_conf": 8.61328125, "calib/ece": 0.37884773662551463, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.9958847736625515, "calib/gap": 0.0009204931972788266, "calib/mean_conf": 0.9837860082304529, "calib/mu_c": 0.9841496598639456, "calib/mu_w": 0.9832291666666668, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37884773662551463, "calib/std_conf": 0.007675752112042767, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8871908396946565, "calib/step_q_c_n": 1310.0, "calib/step_q_gap": 0.026710392767282265, "calib/step_q_w": 0.8604804469273742, "calib/step_q_w_n": 895.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2809.0, "completions/max_terminated_length": 2809.0, "completions/mean_length": 812.13671875, "completions/mean_terminated_length": 828.3147583007812, "completions/min_length": 0.0, "completions/min_terminated_length": 387.0, "epoch": 0.07253333333333334, "grad_norm": 0.02631233073771, "kl": 0.04804229736328125, "learning_rate": 3.6666666666666666e-06, "loss": -0.0813, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.018240146338939667, "mask/share_reasoning": 0.8457576632499695, "mask/share_step_conf": 0.11647099256515503, "num_tokens": 21621746.0, "reward": 0.7804622054100037, "reward_std": 0.1851496696472168, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.582507848739624, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.6752915382385254, "step": 68 }, { "adv/mean_abs_final_conf": 0.6945899724960327, "adv/mean_abs_reasoning": 0.5294466614723206, "adv/mean_abs_step_conf": 0.7615878582000732, "adv/ratio_final_to_reasoning": 1.3119168049232204, "adv/ratio_step_to_reasoning": 1.4384600255712237, "adv/std_final_conf": 0.8556989431381226, "adv/std_reasoning": 0.7754108309745789, "adv/std_step_conf": 0.9333360195159912, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5152447552447552, "calib/avg_num_step_conf": 8.15625, "calib/ece": 0.3955967078189303, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9958847736625515, "calib/gap": -0.00021398601398614492, "calib/mean_conf": 0.9840740740740743, "calib/mu_c": 0.9839860139860143, "calib/mu_w": 0.9842000000000004, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3955967078189303, "calib/std_conf": 0.007332959212304939, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8818712955122779, "calib/step_q_c_n": 1181.0, "calib/step_q_gap": 0.0006915821715943737, "calib/step_q_w": 0.8811797133406836, "calib/step_q_w_n": 907.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2943.0, "completions/max_terminated_length": 2943.0, "completions/mean_length": 840.9140625, "completions/mean_terminated_length": 868.040283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 514.0, "epoch": 0.0736, "grad_norm": 0.029118498787283897, "kl": 0.043987274169921875, "learning_rate": 3.638888888888889e-06, "loss": -0.1294, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.016698405146598816, "mask/share_reasoning": 0.8528545498847961, "mask/share_step_conf": 0.09919705986976624, "num_tokens": 21941516.0, "reward": 0.769717812538147, "reward_std": 0.2492179572582245, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5706456899642944, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.6672273874282837, "step": 69 }, { "adv/mean_abs_final_conf": 0.6649423241615295, "adv/mean_abs_reasoning": 0.4383116364479065, "adv/mean_abs_step_conf": 0.742194652557373, "adv/ratio_final_to_reasoning": 1.5170537783350824, "adv/ratio_step_to_reasoning": 1.693303555826502, "adv/std_final_conf": 0.8065817356109619, "adv/std_reasoning": 0.7205896973609924, "adv/std_step_conf": 0.9320774078369141, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5444521497919557, "calib/avg_num_step_conf": 9.32421875, "calib/ece": 0.40720164609053516, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9876543209876543, "calib/gap": 0.005280166435506439, "calib/mean_conf": 0.9833333333333335, "calib/mu_c": 0.9855714285714287, "calib/mu_w": 0.9802912621359222, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40720164609053516, "calib/std_conf": 0.020507952888751305, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8900559552358114, "calib/step_q_c_n": 1251.0, "calib/step_q_gap": -0.00013770673601964933, "calib/step_q_w": 0.890193661971831, "calib/step_q_w_n": 1136.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2640.0, "completions/max_terminated_length": 2640.0, "completions/mean_length": 817.83984375, "completions/mean_terminated_length": 854.5591430664062, "completions/min_length": 0.0, "completions/min_terminated_length": 460.0, "epoch": 0.07466666666666667, "grad_norm": 0.04475827515125275, "kl": 0.048248291015625, "learning_rate": 3.6111111111111115e-06, "loss": -0.2043, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.017057053744792938, "mask/share_reasoning": 0.832466721534729, "mask/share_step_conf": 0.10750746726989746, "num_tokens": 22257875.0, "reward": 0.7592463493347168, "reward_std": 0.2055116891860962, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5620715022087097, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.6572023630142212, "step": 70 }, { "adv/mean_abs_final_conf": 0.6414079666137695, "adv/mean_abs_reasoning": 0.4914470911026001, "adv/mean_abs_step_conf": 0.7306813597679138, "adv/ratio_final_to_reasoning": 1.3051414449818401, "adv/ratio_step_to_reasoning": 1.486795573717962, "adv/std_final_conf": 0.8314691781997681, "adv/std_reasoning": 0.7753694653511047, "adv/std_step_conf": 0.9239351749420166, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.48824191838897724, "calib/avg_num_step_conf": 8.6953125, "calib/ece": 0.4347368421052634, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9919028340080972, "calib/gap": -0.00011128775834678972, "calib/mean_conf": 0.9853441295546561, "calib/mu_c": 0.9852941176470589, "calib/mu_w": 0.9854054054054057, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4347368421052634, "calib/std_conf": 0.009120371154252042, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8855157715260017, "calib/step_q_c_n": 1173.0, "calib/step_q_gap": -0.001283848606951743, "calib/step_q_w": 0.8867996201329534, "calib/step_q_w_n": 1053.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2967.0, "completions/max_terminated_length": 2967.0, "completions/mean_length": 847.86328125, "completions/mean_terminated_length": 861.3214721679688, "completions/min_length": 0.0, "completions/min_terminated_length": 498.0, "epoch": 0.07573333333333333, "grad_norm": 0.026600291952490807, "kl": 0.044338226318359375, "learning_rate": 3.5833333333333335e-06, "loss": -0.0945, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.017498448491096497, "mask/share_reasoning": 0.8599810600280762, "mask/share_step_conf": 0.10689544677734375, "num_tokens": 22579336.0, "reward": 0.7447740435600281, "reward_std": 0.24545089900493622, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.5397129058837891, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.6513975858688354, "step": 71 }, { "adv/mean_abs_final_conf": 0.6619936227798462, "adv/mean_abs_reasoning": 0.5136440396308899, "adv/mean_abs_step_conf": 0.7351285219192505, "adv/ratio_final_to_reasoning": 1.2888178810671334, "adv/ratio_step_to_reasoning": 1.431202282513629, "adv/std_final_conf": 0.867818295955658, "adv/std_reasoning": 0.7926632761955261, "adv/std_step_conf": 0.9319378733634949, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4561651583710407, "calib/avg_num_step_conf": 8.15234375, "calib/ece": 0.44885375494071145, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9960474308300395, "calib/gap": -0.00144419306183996, "calib/mean_conf": 0.986403162055336, "calib/mu_c": 0.985735294117647, "calib/mu_w": 0.987179487179487, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.44885375494071145, "calib/std_conf": 0.0071756277962406025, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8852765583845479, "calib/step_q_c_n": 1139.0, "calib/step_q_gap": -0.005419644147097835, "calib/step_q_w": 0.8906962025316457, "calib/step_q_w_n": 948.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1663.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 786.3046875, "completions/mean_terminated_length": 795.6284790039062, "completions/min_length": 0.0, "completions/min_terminated_length": 490.0, "epoch": 0.0768, "grad_norm": 0.02710404060781002, "kl": 0.046268463134765625, "learning_rate": 3.555555555555556e-06, "loss": -0.0352, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.018414214253425598, "mask/share_reasoning": 0.8599804639816284, "mask/share_step_conf": 0.10988658666610718, "num_tokens": 22885038.0, "reward": 0.7634243965148926, "reward_std": 0.26252222061157227, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.5388292670249939, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.6848944425582886, "step": 72 }, { "adv/mean_abs_final_conf": 0.6329753994941711, "adv/mean_abs_reasoning": 0.48065903782844543, "adv/mean_abs_step_conf": 0.7773274183273315, "adv/ratio_final_to_reasoning": 1.3168906640221956, "adv/ratio_step_to_reasoning": 1.6172116971714399, "adv/std_final_conf": 0.7763375639915466, "adv/std_reasoning": 0.720600962638855, "adv/std_step_conf": 0.9217331409454346, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4776785714285714, "calib/avg_num_step_conf": 8.37890625, "calib/ece": 0.3250393700787403, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9960629921259843, "calib/gap": 0.00044435215946891926, "calib/mean_conf": 0.986456692913386, "calib/mu_c": 0.986607142857143, "calib/mu_w": 0.9861627906976741, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3250393700787403, "calib/std_conf": 0.0071571294838786385, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8882424677187949, "calib/step_q_c_n": 1394.0, "calib/step_q_gap": 0.0009455303020172723, "calib/step_q_w": 0.8872969374167776, "calib/step_q_w_n": 751.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1805.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 782.203125, "completions/mean_terminated_length": 788.3621826171875, "completions/min_length": 0.0, "completions/min_terminated_length": 460.0, "epoch": 0.07786666666666667, "grad_norm": 0.02680094726383686, "kl": 0.044742584228515625, "learning_rate": 3.5277777777777784e-06, "loss": -0.0179, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.018433989956974983, "mask/share_reasoning": 0.863990068435669, "mask/share_step_conf": 0.10976342111825943, "num_tokens": 23192314.0, "reward": 0.8560742139816284, "reward_std": 0.23352429270744324, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6653140783309937, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7171468138694763, "step": 73 }, { "adv/mean_abs_final_conf": 0.5732172131538391, "adv/mean_abs_reasoning": 0.3150753974914551, "adv/mean_abs_step_conf": 0.7501686215400696, "adv/ratio_final_to_reasoning": 1.8193017218025882, "adv/ratio_step_to_reasoning": 2.3809177978119167, "adv/std_final_conf": 0.7667513489723206, "adv/std_reasoning": 0.6185932159423828, "adv/std_step_conf": 0.9282718896865845, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5352107993084186, "calib/avg_num_step_conf": 8.390625, "calib/ece": 0.4000803212851406, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9959839357429718, "calib/gap": 0.001520813938023835, "calib/mean_conf": 0.986425702811245, "calib/mu_c": 0.987054794520548, "calib/mu_w": 0.9855339805825242, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4000803212851406, "calib/std_conf": 0.007258529258466635, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8932904263877715, "calib/step_q_c_n": 1243.0, "calib/step_q_gap": 0.018273851802136032, "calib/step_q_w": 0.8750165745856354, "calib/step_q_w_n": 905.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2657.0, "completions/max_terminated_length": 2657.0, "completions/mean_length": 815.6015625, "completions/mean_terminated_length": 828.5476684570312, "completions/min_length": 0.0, "completions/min_terminated_length": 431.0, "epoch": 0.07893333333333333, "grad_norm": 0.02454979531466961, "kl": 0.043239593505859375, "learning_rate": 3.5e-06, "loss": -0.0846, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.018031660467386246, "mask/share_reasoning": 0.8548358082771301, "mask/share_step_conf": 0.11150752007961273, "num_tokens": 23505036.0, "reward": 0.8020550012588501, "reward_std": 0.17195594310760498, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.5817226767539978, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7137935161590576, "step": 74 }, { "adv/mean_abs_final_conf": 0.602027416229248, "adv/mean_abs_reasoning": 0.387719064950943, "adv/mean_abs_step_conf": 0.7657424807548523, "adv/ratio_final_to_reasoning": 1.552741329099772, "adv/ratio_step_to_reasoning": 1.974993107062557, "adv/std_final_conf": 0.7324143648147583, "adv/std_reasoning": 0.6404212713241577, "adv/std_step_conf": 0.9212512969970703, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4856928838951311, "calib/avg_num_step_conf": 8.65625, "calib/ece": 0.2824901185770751, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9920948616600791, "calib/gap": 0.0030996254681646684, "calib/mean_conf": 0.9860474308300397, "calib/mu_c": 0.9869662921348314, "calib/mu_w": 0.9838666666666668, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2824901185770751, "calib/std_conf": 0.013694497153698134, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8953473132372216, "calib/step_q_c_n": 1526.0, "calib/step_q_gap": 0.025448762512583878, "calib/step_q_w": 0.8698985507246377, "calib/step_q_w_n": 690.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1944.0, "completions/max_terminated_length": 1944.0, "completions/mean_length": 770.1796875, "completions/mean_terminated_length": 779.312255859375, "completions/min_length": 0.0, "completions/min_terminated_length": 460.0, "epoch": 0.08, "grad_norm": 0.03222297132015228, "kl": 0.044467926025390625, "learning_rate": 3.4722222222222224e-06, "loss": -0.0331, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.01898212358355522, "mask/share_reasoning": 0.8517194986343384, "mask/share_step_conf": 0.11757959425449371, "num_tokens": 23806954.0, "reward": 0.897788941860199, "reward_std": 0.20893093943595886, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.7043101787567139, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7553301453590393, "step": 75 }, { "adv/mean_abs_final_conf": 0.5657827258110046, "adv/mean_abs_reasoning": 0.45047199726104736, "adv/mean_abs_step_conf": 0.720933198928833, "adv/ratio_final_to_reasoning": 1.2559775729702793, "adv/ratio_step_to_reasoning": 1.60039514844927, "adv/std_final_conf": 0.7905355095863342, "adv/std_reasoning": 0.739392876625061, "adv/std_step_conf": 0.924590528011322, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.463395979020979, "calib/avg_num_step_conf": 8.546875, "calib/ece": 0.3495040983606558, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9918032786885246, "calib/gap": -0.002392482517482475, "calib/mean_conf": 0.9872090163934427, "calib/mu_c": 0.9863461538461539, "calib/mu_w": 0.9887386363636363, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3486844262295083, "calib/std_conf": 0.013801927307197861, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8927719033232628, "calib/step_q_c_n": 1324.0, "calib/step_q_gap": 0.006402690360299945, "calib/step_q_w": 0.8863692129629629, "calib/step_q_w_n": 864.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2912.0, "completions/max_terminated_length": 2912.0, "completions/mean_length": 872.39453125, "completions/mean_terminated_length": 889.77294921875, "completions/min_length": 0.0, "completions/min_terminated_length": 479.0, "epoch": 0.08106666666666666, "grad_norm": 0.025584116578102112, "kl": 0.03765106201171875, "learning_rate": 3.444444444444445e-06, "loss": -0.0472, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01710599660873413, "mask/share_reasoning": 0.8587711453437805, "mask/share_step_conf": 0.10459160804748535, "num_tokens": 24133343.0, "reward": 0.8029855489730835, "reward_std": 0.23085039854049683, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6128737926483154, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.680597186088562, "step": 76 }, { "adv/mean_abs_final_conf": 0.543570876121521, "adv/mean_abs_reasoning": 0.48223191499710083, "adv/mean_abs_step_conf": 0.7708549499511719, "adv/ratio_final_to_reasoning": 1.1271980539172504, "adv/ratio_step_to_reasoning": 1.5985150007249236, "adv/std_final_conf": 0.7704401612281799, "adv/std_reasoning": 0.7395368814468384, "adv/std_step_conf": 0.9321821928024292, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.4929743339831059, "calib/avg_num_step_conf": 8.58203125, "calib/ece": 0.2843621399176955, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9958847736625515, "calib/gap": 0.0015935672514618693, "calib/mean_conf": 0.9880658436213993, "calib/mu_c": 0.9885380116959064, "calib/mu_w": 0.9869444444444445, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2843621399176955, "calib/std_conf": 0.009556190009526783, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8938732394366199, "calib/step_q_c_n": 1420.0, "calib/step_q_gap": 0.03588096144434194, "calib/step_q_w": 0.857992277992278, "calib/step_q_w_n": 777.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2766.0, "completions/max_terminated_length": 2766.0, "completions/mean_length": 813.8046875, "completions/mean_terminated_length": 843.45751953125, "completions/min_length": 0.0, "completions/min_terminated_length": 450.0, "epoch": 0.08213333333333334, "grad_norm": 0.01862136833369732, "kl": 0.036975860595703125, "learning_rate": 3.416666666666667e-06, "loss": -0.1461, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.017553173005580902, "mask/share_reasoning": 0.8386209607124329, "mask/share_step_conf": 0.10866963863372803, "num_tokens": 24446341.0, "reward": 0.8542374968528748, "reward_std": 0.26453351974487305, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.6750906109809875, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.709946870803833, "step": 77 }, { "adv/mean_abs_final_conf": 0.5738725662231445, "adv/mean_abs_reasoning": 0.4448193609714508, "adv/mean_abs_step_conf": 0.7464429140090942, "adv/ratio_final_to_reasoning": 1.2901249733596387, "adv/ratio_step_to_reasoning": 1.6780809908519294, "adv/std_final_conf": 0.789870023727417, "adv/std_reasoning": 0.7205432057380676, "adv/std_step_conf": 0.9322795271873474, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5005253572429252, "calib/avg_num_step_conf": 8.08203125, "calib/ece": 0.3290079365079366, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9920634920634921, "calib/gap": 0.0004497057999441889, "calib/mean_conf": 0.9877380952380953, "calib/mu_c": 0.9878915662650601, "calib/mu_w": 0.987441860465116, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3290079365079366, "calib/std_conf": 0.00863977381706126, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8860716952949963, "calib/step_q_c_n": 1339.0, "calib/step_q_gap": 0.00035936652787293966, "calib/step_q_w": 0.8857123287671234, "calib/step_q_w_n": 730.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2601.0, "completions/max_terminated_length": 2601.0, "completions/mean_length": 881.0703125, "completions/mean_terminated_length": 881.0703125, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 0.0832, "grad_norm": 0.02497708797454834, "kl": 0.039813995361328125, "learning_rate": 3.3888888888888893e-06, "loss": 0.0272, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.017089759930968285, "mask/share_reasoning": 0.8821457624435425, "mask/share_step_conf": 0.1007644385099411, "num_tokens": 24779919.0, "reward": 0.8477307558059692, "reward_std": 0.221160426735878, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6566535234451294, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7122455835342407, "step": 78 }, { "adv/mean_abs_final_conf": 0.4717371463775635, "adv/mean_abs_reasoning": 0.3694051206111908, "adv/mean_abs_step_conf": 0.7264292240142822, "adv/ratio_final_to_reasoning": 1.277018427890392, "adv/ratio_step_to_reasoning": 1.9664839047503873, "adv/std_final_conf": 0.7244833111763, "adv/std_reasoning": 0.640355110168457, "adv/std_step_conf": 0.930465817451477, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5197294645360248, "calib/avg_num_step_conf": 8.15234375, "calib/ece": 0.3344223107569721, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9920318725099602, "calib/gap": 0.0013961312026915529, "calib/mean_conf": 0.9878087649402391, "calib/mu_c": 0.9882926829268291, "calib/mu_w": 0.9868965517241376, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3344223107569721, "calib/std_conf": 0.01087685885878777, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8910483870967743, "calib/step_q_c_n": 1364.0, "calib/step_q_gap": 0.011179784053897412, "calib/step_q_w": 0.8798686030428768, "calib/step_q_w_n": 723.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1490.0, "completions/max_terminated_length": 1490.0, "completions/mean_length": 813.0234375, "completions/mean_terminated_length": 829.2191772460938, "completions/min_length": 0.0, "completions/min_terminated_length": 454.0, "epoch": 0.08426666666666667, "grad_norm": 0.024105433374643326, "kl": 0.0389556884765625, "learning_rate": 3.3611111111111117e-06, "loss": -0.0774, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.017385980114340782, "mask/share_reasoning": 0.859592080116272, "mask/share_step_conf": 0.1034906804561615, "num_tokens": 25094429.0, "reward": 0.8523935079574585, "reward_std": 0.19485636055469513, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6492695212364197, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7312988042831421, "step": 79 }, { "adv/mean_abs_final_conf": 0.48533910512924194, "adv/mean_abs_reasoning": 0.4345003366470337, "adv/mean_abs_step_conf": 0.7359863519668579, "adv/ratio_final_to_reasoning": 1.1170051302480513, "adv/ratio_step_to_reasoning": 1.6938683123846148, "adv/std_final_conf": 0.736708402633667, "adv/std_reasoning": 0.7206791639328003, "adv/std_step_conf": 0.9287194013595581, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4836915957605613, "calib/avg_num_step_conf": 8.76171875, "calib/ece": 0.29513904382470113, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9920318725099602, "calib/gap": -0.0012328481862962093, "calib/mean_conf": 0.9883661354581673, "calib/mu_c": 0.9879879310344827, "calib/mu_w": 0.9892207792207789, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.29513904382470113, "calib/std_conf": 0.008474936601766516, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8913844816484223, "calib/step_q_c_n": 1553.0, "calib/step_q_gap": 0.00932651063392953, "calib/step_q_w": 0.8820579710144928, "calib/step_q_w_n": 690.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3072.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 797.9453125, "completions/mean_terminated_length": 807.4071655273438, "completions/min_length": 0.0, "completions/min_terminated_length": 473.0, "epoch": 0.08533333333333333, "grad_norm": 0.026427265256643295, "kl": 0.04227447509765625, "learning_rate": 3.3333333333333333e-06, "loss": -0.0398, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.018396034836769104, "mask/share_reasoning": 0.8537461757659912, "mask/share_step_conf": 0.11613905429840088, "num_tokens": 25400863.0, "reward": 0.8714803457260132, "reward_std": 0.24076946079730988, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.6858910322189331, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7258197069168091, "step": 80 }, { "adv/mean_abs_final_conf": 0.4667571783065796, "adv/mean_abs_reasoning": 0.38079017400741577, "adv/mean_abs_step_conf": 0.7052954435348511, "adv/ratio_final_to_reasoning": 1.2257595131577361, "adv/ratio_step_to_reasoning": 1.8521891888972841, "adv/std_final_conf": 0.722691535949707, "adv/std_reasoning": 0.7015290856361389, "adv/std_step_conf": 0.9257702827453613, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5446386946386946, "calib/avg_num_step_conf": 8.7265625, "calib/ece": 0.3096707818930041, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0008927738927744233, "calib/mean_conf": 0.9886831275720165, "calib/mu_c": 0.988969696969697, "calib/mu_w": 0.9880769230769226, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3096707818930041, "calib/std_conf": 0.003381504293691219, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8905312954876274, "calib/step_q_c_n": 1374.0, "calib/step_q_gap": 0.03453943502251111, "calib/step_q_w": 0.8559918604651163, "calib/step_q_w_n": 860.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2985.0, "completions/max_terminated_length": 2985.0, "completions/mean_length": 807.69921875, "completions/mean_terminated_length": 837.1295776367188, "completions/min_length": 0.0, "completions/min_terminated_length": 471.0, "epoch": 0.0864, "grad_norm": 0.023954054340720177, "kl": 0.041957855224609375, "learning_rate": 3.3055555555555558e-06, "loss": -0.1408, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.017638327553868294, "mask/share_reasoning": 0.8397368788719177, "mask/share_step_conf": 0.10746851563453674, "num_tokens": 25713882.0, "reward": 0.8331348896026611, "reward_std": 0.22398020327091217, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6516644954681396, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.6958553194999695, "step": 81 }, { "adv/mean_abs_final_conf": 0.5066346526145935, "adv/mean_abs_reasoning": 0.42569518089294434, "adv/mean_abs_step_conf": 0.7474675178527832, "adv/ratio_final_to_reasoning": 1.1901348085544905, "adv/ratio_step_to_reasoning": 1.7558749814477217, "adv/std_final_conf": 0.7515454292297363, "adv/std_reasoning": 0.7013655304908752, "adv/std_step_conf": 0.9292082786560059, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5395319855995568, "calib/avg_num_step_conf": 8.40625, "calib/ece": 0.35685943775100404, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9919678714859438, "calib/gap": 0.00501827748546102, "calib/mean_conf": 0.9873815261044177, "calib/mu_c": 0.9892356687898087, "calib/mu_w": 0.9842173913043477, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35685943775100404, "calib/std_conf": 0.01948707274314714, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8947453874538747, "calib/step_q_c_n": 1355.0, "calib/step_q_gap": 0.02878428331334759, "calib/step_q_w": 0.8659611041405271, "calib/step_q_w_n": 797.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2186.0, "completions/max_terminated_length": 2186.0, "completions/mean_length": 776.33984375, "completions/mean_terminated_length": 788.6627197265625, "completions/min_length": 0.0, "completions/min_terminated_length": 430.0, "epoch": 0.08746666666666666, "grad_norm": 0.016892658546566963, "kl": 0.04087066650390625, "learning_rate": 3.277777777777778e-06, "loss": -0.1035, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.018185051158070564, "mask/share_reasoning": 0.853351891040802, "mask/share_step_conf": 0.11283804476261139, "num_tokens": 26018177.0, "reward": 0.8254947066307068, "reward_std": 0.19936254620552063, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6241007447242737, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7097011208534241, "step": 82 }, { "adv/mean_abs_final_conf": 0.41412049531936646, "adv/mean_abs_reasoning": 0.26981937885284424, "adv/mean_abs_step_conf": 0.7399859428405762, "adv/ratio_final_to_reasoning": 1.5348063474166622, "adv/ratio_step_to_reasoning": 2.742523335375975, "adv/std_final_conf": 0.6758379936218262, "adv/std_reasoning": 0.5727855563163757, "adv/std_step_conf": 0.9238594770431519, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5109394229454397, "calib/avg_num_step_conf": 8.1640625, "calib/ece": 0.40860816326530613, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9918367346938776, "calib/gap": 0.001752153698892478, "calib/mean_conf": 0.9882, "calib/mu_c": 0.9889366197183097, "calib/mu_w": 0.9871844660194172, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40860816326530613, "calib/std_conf": 0.008615884713002783, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8929226713532513, "calib/step_q_c_n": 1138.0, "calib/step_q_gap": 0.027365948664175743, "calib/step_q_w": 0.8655567226890756, "calib/step_q_w_n": 952.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3038.0, "completions/max_terminated_length": 3038.0, "completions/mean_length": 890.20703125, "completions/mean_terminated_length": 897.216552734375, "completions/min_length": 0.0, "completions/min_terminated_length": 485.0, "epoch": 0.08853333333333334, "grad_norm": 0.021719301119446754, "kl": 0.0386505126953125, "learning_rate": 3.2500000000000002e-06, "loss": -0.0612, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.016932372003793716, "mask/share_reasoning": 0.8745453953742981, "mask/share_step_conf": 0.10070972144603729, "num_tokens": 26353334.0, "reward": 0.7632154226303101, "reward_std": 0.13492730259895325, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.564795732498169, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.6592912673950195, "step": 83 }, { "adv/mean_abs_final_conf": 0.4269552230834961, "adv/mean_abs_reasoning": 0.35676440596580505, "adv/mean_abs_step_conf": 0.7534521818161011, "adv/ratio_final_to_reasoning": 1.1967427690205694, "adv/ratio_step_to_reasoning": 2.111904016255247, "adv/std_final_conf": 0.689056932926178, "adv/std_reasoning": 0.6404105424880981, "adv/std_step_conf": 0.9300892353057861, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5342475640562974, "calib/avg_num_step_conf": 8.37109375, "calib/ece": 0.33128629032258056, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9959677419354839, "calib/gap": 0.002797185131721669, "calib/mean_conf": 0.9885443548387097, "calib/mu_c": 0.9895030674846625, "calib/mu_w": 0.9867058823529409, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33128629032258056, "calib/std_conf": 0.012289438177264208, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.893919770773639, "calib/step_q_c_n": 1396.0, "calib/step_q_gap": 0.008210266088230722, "calib/step_q_w": 0.8857095046854083, "calib/step_q_w_n": 747.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2809.0, "completions/max_terminated_length": 2809.0, "completions/mean_length": 775.32421875, "completions/mean_terminated_length": 793.9320678710938, "completions/min_length": 0.0, "completions/min_terminated_length": 464.0, "epoch": 0.0896, "grad_norm": 0.01733134128153324, "kl": 0.04576873779296875, "learning_rate": 3.2222222222222227e-06, "loss": -0.0971, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01821688935160637, "mask/share_reasoning": 0.8478014469146729, "mask/share_step_conf": 0.11054414510726929, "num_tokens": 26657737.0, "reward": 0.8509284257888794, "reward_std": 0.20014074444770813, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6452734470367432, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7354897260665894, "step": 84 }, { "adv/mean_abs_final_conf": 0.5446729063987732, "adv/mean_abs_reasoning": 0.47163212299346924, "adv/mean_abs_step_conf": 0.7699087858200073, "adv/ratio_final_to_reasoning": 1.1548681267546217, "adv/ratio_step_to_reasoning": 1.632435002377199, "adv/std_final_conf": 0.7841033935546875, "adv/std_reasoning": 0.7394751906394958, "adv/std_step_conf": 0.9309518933296204, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.5223388305847078, "calib/avg_num_step_conf": 8.890625, "calib/ece": 0.37514767932489457, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.9873417721518988, "calib/gap": 0.005516491754123276, "calib/mean_conf": 0.9869620253164557, "calib/mu_c": 0.9891034482758622, "calib/mu_w": 0.9835869565217389, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37514767932489457, "calib/std_conf": 0.01857958798167976, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8873767752715122, "calib/step_q_c_n": 1197.0, "calib/step_q_gap": 0.008933772491160008, "calib/step_q_w": 0.8784430027803521, "calib/step_q_w_n": 1079.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2759.0, "completions/max_terminated_length": 2759.0, "completions/mean_length": 814.48828125, "completions/mean_terminated_length": 851.05712890625, "completions/min_length": 0.0, "completions/min_terminated_length": 487.0, "epoch": 0.09066666666666667, "grad_norm": 0.01905909925699234, "kl": 0.044300079345703125, "learning_rate": 3.1944444444444443e-06, "loss": -0.1392, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.0170908086001873, "mask/share_reasoning": 0.8357868194580078, "mask/share_step_conf": 0.104153573513031, "num_tokens": 26974070.0, "reward": 0.7635809183120728, "reward_std": 0.2349778413772583, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.5776484608650208, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.6518570184707642, "step": 85 }, { "adv/mean_abs_final_conf": 0.525831937789917, "adv/mean_abs_reasoning": 0.43927913904190063, "adv/mean_abs_step_conf": 0.7811049222946167, "adv/ratio_final_to_reasoning": 1.1970337105850148, "adv/ratio_step_to_reasoning": 1.7781516417972014, "adv/std_final_conf": 0.751258373260498, "adv/std_reasoning": 0.7014529705047607, "adv/std_step_conf": 0.9302042722702026, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4905166775670373, "calib/avg_num_step_conf": 8.46875, "calib/ece": 0.42939759036144587, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9919678714859438, "calib/gap": 0.0025951602354481462, "calib/mean_conf": 0.9876305220883534, "calib/mu_c": 0.9887769784172662, "calib/mu_w": 0.986181818181818, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42939759036144587, "calib/std_conf": 0.01641602802241287, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8897205757832345, "calib/step_q_c_n": 1181.0, "calib/step_q_gap": 0.007218042855169693, "calib/step_q_w": 0.8825025329280648, "calib/step_q_w_n": 987.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1973.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 849.42578125, "completions/mean_terminated_length": 856.1141967773438, "completions/min_length": 0.0, "completions/min_terminated_length": 444.0, "epoch": 0.09173333333333333, "grad_norm": 0.018472064286470413, "kl": 0.046112060546875, "learning_rate": 3.1666666666666667e-06, "loss": -0.0582, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01759396679699421, "mask/share_reasoning": 0.8653203248977661, "mask/share_step_conf": 0.10927321016788483, "num_tokens": 27297035.0, "reward": 0.754798412322998, "reward_std": 0.2171429842710495, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.55443274974823, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.6520390510559082, "step": 86 }, { "adv/mean_abs_final_conf": 0.5290802121162415, "adv/mean_abs_reasoning": 0.44977080821990967, "adv/mean_abs_step_conf": 0.7352198362350464, "adv/ratio_final_to_reasoning": 1.1763329287870423, "adv/ratio_step_to_reasoning": 1.6346544124214706, "adv/std_final_conf": 0.769879937171936, "adv/std_reasoning": 0.7393056750297546, "adv/std_step_conf": 0.9318410158157349, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5486928104575164, "calib/avg_num_step_conf": 8.63671875, "calib/ece": 0.2612096774193548, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9879032258064516, "calib/gap": 0.004196078431372374, "calib/mean_conf": 0.9870161290322582, "calib/mu_c": 0.9881666666666666, "calib/mu_w": 0.9839705882352943, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2612096774193548, "calib/std_conf": 0.020179830185260015, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8923865110246433, "calib/step_q_c_n": 1542.0, "calib/step_q_gap": 0.009021937033611871, "calib/step_q_w": 0.8833645739910314, "calib/step_q_w_n": 669.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3017.0, "completions/max_terminated_length": 3017.0, "completions/mean_length": 773.546875, "completions/mean_terminated_length": 795.2931518554688, "completions/min_length": 0.0, "completions/min_terminated_length": 430.0, "epoch": 0.0928, "grad_norm": 0.019448574632406235, "kl": 0.04779815673828125, "learning_rate": 3.138888888888889e-06, "loss": -0.0334, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01838276907801628, "mask/share_reasoning": 0.840671956539154, "mask/share_step_conf": 0.11360155045986176, "num_tokens": 27600559.0, "reward": 0.9018452167510986, "reward_std": 0.2351362109184265, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7070223093032837, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7638556361198425, "step": 87 }, { "adv/mean_abs_final_conf": 0.45828092098236084, "adv/mean_abs_reasoning": 0.39458200335502625, "adv/mean_abs_step_conf": 0.7401527762413025, "adv/ratio_final_to_reasoning": 1.1614339150942505, "adv/ratio_step_to_reasoning": 1.875789493560222, "adv/std_final_conf": 0.7244842648506165, "adv/std_reasoning": 0.6816993355751038, "adv/std_step_conf": 0.9296145439147949, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5261466549810109, "calib/avg_num_step_conf": 8.265625, "calib/ece": 0.32947368421052625, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0005229330996203707, "calib/mean_conf": 0.9893927125506072, "calib/mu_c": 0.9895705521472392, "calib/mu_w": 0.9890476190476188, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32947368421052625, "calib/std_conf": 0.0023883208427129777, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8906811377245512, "calib/step_q_c_n": 1336.0, "calib/step_q_gap": 0.00586062490403827, "calib/step_q_w": 0.8848205128205129, "calib/step_q_w_n": 780.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2735.0, "completions/max_terminated_length": 2735.0, "completions/mean_length": 803.74609375, "completions/mean_terminated_length": 823.0360107421875, "completions/min_length": 0.0, "completions/min_terminated_length": 423.0, "epoch": 0.09386666666666667, "grad_norm": 0.017657170072197914, "kl": 0.04468536376953125, "learning_rate": 3.1111111111111116e-06, "loss": -0.1192, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.017656173557043076, "mask/share_reasoning": 0.854995846748352, "mask/share_step_conf": 0.10391046106815338, "num_tokens": 27916166.0, "reward": 0.8516751527786255, "reward_std": 0.21041029691696167, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6437921524047852, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7392456531524658, "step": 88 }, { "adv/mean_abs_final_conf": 0.5156100392341614, "adv/mean_abs_reasoning": 0.4167075455188751, "adv/mean_abs_step_conf": 0.7418398857116699, "adv/ratio_final_to_reasoning": 1.237342699403571, "adv/ratio_step_to_reasoning": 1.780241067600413, "adv/std_final_conf": 0.763513445854187, "adv/std_reasoning": 0.7014049291610718, "adv/std_step_conf": 0.928699791431427, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5216002747252748, "calib/avg_num_step_conf": 8.4453125, "calib/ece": 0.41422131147540997, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9918032786885246, "calib/gap": 0.0003543956043953189, "calib/mean_conf": 0.9879918032786886, "calib/mu_c": 0.988142857142857, "calib/mu_w": 0.9877884615384617, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41422131147540997, "calib/std_conf": 0.009124813855401486, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8837217099748533, "calib/step_q_c_n": 1193.0, "calib/step_q_gap": 0.010646374577536477, "calib/step_q_w": 0.8730753353973169, "calib/step_q_w_n": 969.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2964.0, "completions/max_terminated_length": 2964.0, "completions/mean_length": 859.01171875, "completions/mean_terminated_length": 883.1605834960938, "completions/min_length": 0.0, "completions/min_terminated_length": 427.0, "epoch": 0.09493333333333333, "grad_norm": 0.025277912616729736, "kl": 0.044895172119140625, "learning_rate": 3.0833333333333336e-06, "loss": -0.0676, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.016975190490484238, "mask/share_reasoning": 0.8520951271057129, "mask/share_step_conf": 0.10358592122793198, "num_tokens": 28244961.0, "reward": 0.7633959650993347, "reward_std": 0.19842402637004852, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5565800666809082, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.6702117919921875, "step": 89 }, { "adv/mean_abs_final_conf": 0.5149222612380981, "adv/mean_abs_reasoning": 0.4274728298187256, "adv/mean_abs_step_conf": 0.7600886821746826, "adv/ratio_final_to_reasoning": 1.204573075337809, "adv/ratio_step_to_reasoning": 1.7780982302360744, "adv/std_final_conf": 0.7509579658508301, "adv/std_reasoning": 0.7014234066009521, "adv/std_step_conf": 0.9276285171508789, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5043859649122807, "calib/avg_num_step_conf": 9.07421875, "calib/ece": 0.29223200000000016, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.992, "calib/gap": 0.00067362371445856, "calib/mean_conf": 0.9882320000000001, "calib/mu_c": 0.9884367816091953, "calib/mu_w": 0.9877631578947368, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.29223200000000016, "calib/std_conf": 0.008574507332785948, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8846670918367348, "calib/step_q_c_n": 1568.0, "calib/step_q_gap": 0.0035809991215028925, "calib/step_q_w": 0.8810860927152319, "calib/step_q_w_n": 755.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2763.0, "completions/max_terminated_length": 2763.0, "completions/mean_length": 829.625, "completions/mean_terminated_length": 842.793701171875, "completions/min_length": 0.0, "completions/min_terminated_length": 399.0, "epoch": 0.096, "grad_norm": 0.01628192327916622, "kl": 0.04920196533203125, "learning_rate": 3.055555555555556e-06, "loss": -0.0194, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.017995685338974, "mask/share_reasoning": 0.8512020111083984, "mask/share_step_conf": 0.11517727375030518, "num_tokens": 28560665.0, "reward": 0.8831427693367004, "reward_std": 0.22143936157226562, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.6867460608482361, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7475081086158752, "step": 90 }, { "adv/mean_abs_final_conf": 0.5486507415771484, "adv/mean_abs_reasoning": 0.4548686742782593, "adv/mean_abs_step_conf": 0.7140244245529175, "adv/ratio_final_to_reasoning": 1.2061739411879555, "adv/ratio_step_to_reasoning": 1.569737519704695, "adv/std_final_conf": 0.79872065782547, "adv/std_reasoning": 0.7394763231277466, "adv/std_step_conf": 0.9286575317382812, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5275906183368869, "calib/avg_num_step_conf": 8.875, "calib/ece": 0.26574380165247946, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.987603305785124, "calib/gap": 0.0005057569290661812, "calib/mean_conf": 0.9872314049582647, "calib/mu_c": 0.9873714285708571, "calib/mu_w": 0.986865671641791, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.26491735537148775, "calib/std_conf": 0.01499931697064191, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.887391186440678, "calib/step_q_c_n": 1475.0, "calib/step_q_gap": 0.02412267953979974, "calib/step_q_w": 0.8632685069008783, "calib/step_q_w_n": 797.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3070.0, "completions/max_terminated_length": 3070.0, "completions/mean_length": 850.0078125, "completions/mean_terminated_length": 873.903564453125, "completions/min_length": 0.0, "completions/min_terminated_length": 528.0, "epoch": 0.09706666666666666, "grad_norm": 0.01996271312236786, "kl": 0.048755645751953125, "learning_rate": 3.0277777777777776e-06, "loss": -0.1626, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.016846606507897377, "mask/share_reasoning": 0.8526731133460999, "mask/share_step_conf": 0.10313647985458374, "num_tokens": 28885979.0, "reward": 0.8746767640113831, "reward_std": 0.24369308352470398, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.6861194968223572, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.7382339835166931, "step": 91 }, { "adv/mean_abs_final_conf": 0.537227988243103, "adv/mean_abs_reasoning": 0.3748323321342468, "adv/mean_abs_step_conf": 0.7305430173873901, "adv/ratio_final_to_reasoning": 1.433248794692273, "adv/ratio_step_to_reasoning": 1.9489861326203444, "adv/std_final_conf": 0.7719976902008057, "adv/std_reasoning": 0.661332905292511, "adv/std_step_conf": 0.9242905974388123, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5755091723241369, "calib/avg_num_step_conf": 9.01171875, "calib/ece": 0.33553846153846156, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9757085020242915, "calib/gap": 0.008520511339015324, "calib/mean_conf": 0.9792631578947368, "calib/mu_c": 0.9822298136645964, "calib/mu_w": 0.9737093023255811, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33148987854251016, "calib/std_conf": 0.08567871193649285, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8842680339462518, "calib/step_q_c_n": 1414.0, "calib/step_q_gap": 0.02231618624188436, "calib/step_q_w": 0.8619518477043674, "calib/step_q_w_n": 893.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2816.0, "completions/max_terminated_length": 2816.0, "completions/mean_length": 805.90625, "completions/mean_terminated_length": 818.698486328125, "completions/min_length": 0.0, "completions/min_terminated_length": 434.0, "epoch": 0.09813333333333334, "grad_norm": 0.022326858714222908, "kl": 0.0508575439453125, "learning_rate": 3e-06, "loss": -0.0959, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.01844978705048561, "mask/share_reasoning": 0.8456693291664124, "mask/share_step_conf": 0.12025590240955353, "num_tokens": 29199011.0, "reward": 0.8462902307510376, "reward_std": 0.18844886124134064, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.639072597026825, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.734757661819458, "step": 92 }, { "adv/mean_abs_final_conf": 0.5825698375701904, "adv/mean_abs_reasoning": 0.4962431788444519, "adv/mean_abs_step_conf": 0.7400534152984619, "adv/ratio_final_to_reasoning": 1.1739603936254763, "adv/ratio_step_to_reasoning": 1.491312015656809, "adv/std_final_conf": 0.8054587244987488, "adv/std_reasoning": 0.7754384875297546, "adv/std_step_conf": 0.9276998043060303, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5200179600389134, "calib/avg_num_step_conf": 9.0859375, "calib/ece": 0.32852459016393454, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9959016393442623, "calib/gap": 0.0013499962583252145, "calib/mean_conf": 0.988360655737705, "calib/mu_c": 0.9888198757763974, "calib/mu_w": 0.9874698795180722, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32852459016393454, "calib/std_conf": 0.006573750051069972, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8862597864768684, "calib/step_q_c_n": 1405.0, "calib/step_q_gap": 0.015358592122905357, "calib/step_q_w": 0.8709011943539631, "calib/step_q_w_n": 921.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2945.0, "completions/max_terminated_length": 2945.0, "completions/mean_length": 818.04296875, "completions/mean_terminated_length": 844.431396484375, "completions/min_length": 0.0, "completions/min_terminated_length": 410.0, "epoch": 0.0992, "grad_norm": 6.2837347984313965, "kl": 3.0508956909179688, "learning_rate": 2.9722222222222225e-06, "loss": -0.0992, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.017331324517726898, "mask/share_reasoning": 0.8419540524482727, "mask/share_step_conf": 0.1094646155834198, "num_tokens": 29514206.0, "reward": 0.8271012902259827, "reward_std": 0.2480258047580719, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6368609666824341, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7009353637695312, "step": 93 }, { "adv/mean_abs_final_conf": 0.5216161608695984, "adv/mean_abs_reasoning": 0.43027183413505554, "adv/mean_abs_step_conf": 0.7488489747047424, "adv/ratio_final_to_reasoning": 1.212294460124646, "adv/ratio_step_to_reasoning": 1.7404090049493934, "adv/std_final_conf": 0.7995529770851135, "adv/std_reasoning": 0.7396001815795898, "adv/std_step_conf": 0.9273629784584045, "calib/answer_extract_rate": 0.9140625, "calib/auroc": 0.5579310344827586, "calib/avg_num_step_conf": 9.29296875, "calib/ece": 0.36985957446808526, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.9787234042553191, "calib/gap": 0.0030482758620687944, "calib/mean_conf": 0.98688085106383, "calib/mu_c": 0.9880482758620689, "calib/mu_w": 0.9850000000000001, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36985957446808526, "calib/std_conf": 0.013457365906745298, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8902324455205811, "calib/step_q_c_n": 1239.0, "calib/step_q_gap": 0.03742542797672144, "calib/step_q_w": 0.8528070175438597, "calib/step_q_w_n": 1140.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2935.0, "completions/max_terminated_length": 2935.0, "completions/mean_length": 828.6015625, "completions/mean_terminated_length": 869.3524169921875, "completions/min_length": 0.0, "completions/min_terminated_length": 465.0, "epoch": 0.10026666666666667, "grad_norm": 0.0222991481423378, "kl": 0.054683685302734375, "learning_rate": 2.944444444444445e-06, "loss": -0.1839, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.016976788640022278, "mask/share_reasoning": 0.8307620286941528, "mask/share_step_conf": 0.10538618266582489, "num_tokens": 29835008.0, "reward": 0.767528235912323, "reward_std": 0.2512366771697998, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.5765511393547058, "rewards/format_reward_step": 0.9140625, "rewards/step_l2_reward": 0.6624114513397217, "step": 94 }, { "adv/mean_abs_final_conf": 0.527319073677063, "adv/mean_abs_reasoning": 0.3920542001724243, "adv/mean_abs_step_conf": 0.7362141609191895, "adv/ratio_final_to_reasoning": 1.3450157489580512, "adv/ratio_step_to_reasoning": 1.8778377086520297, "adv/std_final_conf": 0.7650555968284607, "adv/std_reasoning": 0.6817232370376587, "adv/std_step_conf": 0.9281631112098694, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5381760339342524, "calib/avg_num_step_conf": 8.74609375, "calib/ece": 0.3236625514403294, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9794238683127572, "calib/gap": 0.004409180427207837, "calib/mean_conf": 0.9862139917695475, "calib/mu_c": 0.9877018633540372, "calib/mu_w": 0.9832926829268294, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3236625514403294, "calib/std_conf": 0.014560343067339289, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8967602591792656, "calib/step_q_c_n": 1389.0, "calib/step_q_gap": 0.050474376826324496, "calib/step_q_w": 0.8462858823529411, "calib/step_q_w_n": 850.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2133.0, "completions/max_terminated_length": 2133.0, "completions/mean_length": 782.41015625, "completions/mean_terminated_length": 797.9960327148438, "completions/min_length": 0.0, "completions/min_terminated_length": 417.0, "epoch": 0.10133333333333333, "grad_norm": 0.0220172256231308, "kl": 0.0579681396484375, "learning_rate": 2.916666666666667e-06, "loss": -0.0616, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.0183150302618742, "mask/share_reasoning": 0.8456125259399414, "mask/share_step_conf": 0.11654117703437805, "num_tokens": 30141433.0, "reward": 0.8251522779464722, "reward_std": 0.2152584344148636, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6392277479171753, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.6954517364501953, "step": 95 }, { "adv/mean_abs_final_conf": 0.5503877997398376, "adv/mean_abs_reasoning": 0.3534834384918213, "adv/mean_abs_step_conf": 0.7321844100952148, "adv/ratio_final_to_reasoning": 1.5570398491316368, "adv/ratio_step_to_reasoning": 2.071340069620138, "adv/std_final_conf": 0.7819110155105591, "adv/std_reasoning": 0.6613574624061584, "adv/std_step_conf": 0.9222843050956726, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5541159880133968, "calib/avg_num_step_conf": 8.9453125, "calib/ece": 0.23044129554655873, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9838056680161943, "calib/gap": 0.016938039837828267, "calib/mean_conf": 0.9822631578947368, "calib/mu_c": 0.9864462365591397, "calib/mu_w": 0.9695081967213114, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22983400809716603, "calib/std_conf": 0.06409430977468505, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8916709102482495, "calib/step_q_c_n": 1571.0, "calib/step_q_gap": 0.011142398426274513, "calib/step_q_w": 0.880528511821975, "calib/step_q_w_n": 719.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2074.0, "completions/max_terminated_length": 2074.0, "completions/mean_length": 743.26171875, "completions/mean_terminated_length": 767.2378540039062, "completions/min_length": 0.0, "completions/min_terminated_length": 422.0, "epoch": 0.1024, "grad_norm": 0.020050369203090668, "kl": 0.062469482421875, "learning_rate": 2.888888888888889e-06, "loss": -0.148, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.01858597621321678, "mask/share_reasoning": 0.8389642238616943, "mask/share_step_conf": 0.1111997589468956, "num_tokens": 30437524.0, "reward": 0.9249247312545776, "reward_std": 0.19512362778186798, "rewards/accuracy_reward_step": 0.7265625, "rewards/final_brier_reward_step": 0.7367488145828247, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7756006121635437, "step": 96 }, { "adv/mean_abs_final_conf": 0.5968998074531555, "adv/mean_abs_reasoning": 0.5177270770072937, "adv/mean_abs_step_conf": 0.7378278970718384, "adv/ratio_final_to_reasoning": 1.1529236811478307, "adv/ratio_step_to_reasoning": 1.4251290493377922, "adv/std_final_conf": 0.8343783617019653, "adv/std_reasoning": 0.7928089499473572, "adv/std_step_conf": 0.9322978854179382, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.4990490278951817, "calib/avg_num_step_conf": 8.9140625, "calib/ece": 0.3497813765182187, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.97165991902834, "calib/gap": 0.01189743589743597, "calib/mean_conf": 0.9813603238866397, "calib/mu_c": 0.9857435897435897, "calib/mu_w": 0.9738461538461537, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3497813765182187, "calib/std_conf": 0.06528024082990255, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8857207407407407, "calib/step_q_c_n": 1350.0, "calib/step_q_gap": 0.007405290096964001, "calib/step_q_w": 0.8783154506437767, "calib/step_q_w_n": 932.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2562.0, "completions/max_terminated_length": 2562.0, "completions/mean_length": 770.0859375, "completions/mean_terminated_length": 791.7349243164062, "completions/min_length": 0.0, "completions/min_terminated_length": 422.0, "epoch": 0.10346666666666667, "grad_norm": 0.021622126922011375, "kl": 0.0661468505859375, "learning_rate": 2.861111111111111e-06, "loss": -0.1201, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.018322018906474113, "mask/share_reasoning": 0.8374599814414978, "mask/share_step_conf": 0.11687424778938293, "num_tokens": 30739738.0, "reward": 0.8113015294075012, "reward_std": 0.2594345211982727, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6235218048095703, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.6842373609542847, "step": 97 }, { "adv/mean_abs_final_conf": 0.5326055288314819, "adv/mean_abs_reasoning": 0.4282838702201843, "adv/mean_abs_step_conf": 0.7558221817016602, "adv/ratio_final_to_reasoning": 1.2435806386023012, "adv/ratio_step_to_reasoning": 1.7647691969185897, "adv/std_final_conf": 0.7673627734184265, "adv/std_reasoning": 0.7014768123626709, "adv/std_step_conf": 0.9310451149940491, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5135707344162154, "calib/avg_num_step_conf": 8.60546875, "calib/ece": 0.3856204081632655, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9795918367346939, "calib/gap": 0.0012210884353739404, "calib/mean_conf": 0.9856204081632655, "calib/mu_c": 0.9861088435374148, "calib/mu_w": 0.9848877551020409, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3856204081632655, "calib/std_conf": 0.014391848310505025, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8865980629539952, "calib/step_q_c_n": 1239.0, "calib/step_q_gap": 0.0038791832859453867, "calib/step_q_w": 0.8827188796680498, "calib/step_q_w_n": 964.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3041.0, "completions/max_terminated_length": 3041.0, "completions/mean_length": 808.16015625, "completions/mean_terminated_length": 834.2297973632812, "completions/min_length": 0.0, "completions/min_terminated_length": 442.0, "epoch": 0.10453333333333334, "grad_norm": 0.016606997698545456, "kl": 0.0679779052734375, "learning_rate": 2.8333333333333335e-06, "loss": -0.0813, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01783130317926407, "mask/share_reasoning": 0.8402774930000305, "mask/share_step_conf": 0.1106412261724472, "num_tokens": 31052811.0, "reward": 0.7877363562583923, "reward_std": 0.22248929738998413, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.585392951965332, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.6838297843933105, "step": 98 }, { "adv/mean_abs_final_conf": 0.6818572282791138, "adv/mean_abs_reasoning": 0.5733513236045837, "adv/mean_abs_step_conf": 0.760155200958252, "adv/ratio_final_to_reasoning": 1.1892485465846103, "adv/ratio_step_to_reasoning": 1.3258104929090544, "adv/std_final_conf": 0.8731258511543274, "adv/std_reasoning": 0.8100834488868713, "adv/std_step_conf": 0.93084317445755, "calib/answer_extract_rate": 0.8984375, "calib/auroc": 0.5508882866606444, "calib/avg_num_step_conf": 9.7890625, "calib/ece": 0.5188268398268399, "calib/final_conf_rate": 0.90234375, "calib/format_rate": 0.89453125, "calib/frac_conf_gt_0.9": 0.9783549783549783, "calib/gap": -0.0005745257452575148, "calib/mean_conf": 0.9841948051948052, "calib/mu_c": 0.9838888888888888, "calib/mu_w": 0.9844634146341463, "calib/nonempty_final_conf_rate": 0.90234375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5177445887445887, "calib/std_conf": 0.02099229231528474, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8921641791044775, "calib/step_q_c_n": 938.0, "calib/step_q_gap": 0.008407163798354933, "calib/step_q_w": 0.8837570153061226, "calib/step_q_w_n": 1568.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2717.0, "completions/max_terminated_length": 2717.0, "completions/mean_length": 807.765625, "completions/mean_terminated_length": 865.2217407226562, "completions/min_length": 0.0, "completions/min_terminated_length": 396.0, "epoch": 0.1056, "grad_norm": 0.028910474851727486, "kl": 0.06258392333984375, "learning_rate": 2.805555555555556e-06, "loss": -0.2888, "mask/has_final_conf_rate": 0.90234375, "mask/share_final_conf": 0.016516810283064842, "mask/share_reasoning": 0.8145354986190796, "mask/share_step_conf": 0.10254141688346863, "num_tokens": 31365399.0, "reward": 0.6236565113067627, "reward_std": 0.2722751200199127, "rewards/accuracy_reward_step": 0.421875, "rewards/final_brier_reward_step": 0.4360261559486389, "rewards/format_reward_step": 0.89453125, "rewards/step_l2_reward": 0.5480057001113892, "step": 99 }, { "adv/mean_abs_final_conf": 0.5936969518661499, "adv/mean_abs_reasoning": 0.4137655794620514, "adv/mean_abs_step_conf": 0.7432897686958313, "adv/ratio_final_to_reasoning": 1.434863075459376, "adv/ratio_step_to_reasoning": 1.7964030977690408, "adv/std_final_conf": 0.8098013401031494, "adv/std_reasoning": 0.7014086246490479, "adv/std_step_conf": 0.9335170984268188, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5957593330916999, "calib/avg_num_step_conf": 9.30859375, "calib/ece": 0.34782377049180346, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9631147540983607, "calib/gap": 0.006088075389633518, "calib/mean_conf": 0.9830696721311477, "calib/mu_c": 0.985290322580645, "calib/mu_w": 0.9792022471910115, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34782377049180346, "calib/std_conf": 0.018483689705748783, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8866344005956813, "calib/step_q_c_n": 1343.0, "calib/step_q_gap": -0.004342522481241895, "calib/step_q_w": 0.8909769230769232, "calib/step_q_w_n": 1040.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2693.0, "completions/max_terminated_length": 2693.0, "completions/mean_length": 797.84765625, "completions/mean_terminated_length": 826.9190673828125, "completions/min_length": 0.0, "completions/min_terminated_length": 412.0, "epoch": 0.10666666666666667, "grad_norm": 0.023713137954473495, "kl": 0.06809234619140625, "learning_rate": 2.7777777777777783e-06, "loss": -0.1413, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01752588339149952, "mask/share_reasoning": 0.8371009826660156, "mask/share_step_conf": 0.11021691560745239, "num_tokens": 31677056.0, "reward": 0.8176565170288086, "reward_std": 0.2034880518913269, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.619330883026123, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7042633891105652, "step": 100 }, { "adv/mean_abs_final_conf": 0.701155424118042, "adv/mean_abs_reasoning": 0.5449827909469604, "adv/mean_abs_step_conf": 0.7575080990791321, "adv/ratio_final_to_reasoning": 1.2865643388476844, "adv/ratio_step_to_reasoning": 1.3899670075139223, "adv/std_final_conf": 0.8829103112220764, "adv/std_reasoning": 0.7928749918937683, "adv/std_step_conf": 0.932668924331665, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.49327324973876696, "calib/avg_num_step_conf": 8.59375, "calib/ece": 0.515241935483871, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9717741935483871, "calib/gap": -0.0023667711598743812, "calib/mean_conf": 0.9829838709677421, "calib/mu_c": 0.9817241379310345, "calib/mu_w": 0.9840909090909089, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.515241935483871, "calib/std_conf": 0.02107895444512084, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8749799599198397, "calib/step_q_c_n": 998.0, "calib/step_q_gap": -0.00976472486478086, "calib/step_q_w": 0.8847446847846205, "calib/step_q_w_n": 1202.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2546.0, "completions/max_terminated_length": 2546.0, "completions/mean_length": 858.80859375, "completions/mean_terminated_length": 875.9163818359375, "completions/min_length": 0.0, "completions/min_terminated_length": 426.0, "epoch": 0.10773333333333333, "grad_norm": 0.031017888337373734, "kl": 0.07028961181640625, "learning_rate": 2.7500000000000004e-06, "loss": -0.0571, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.017236188054084778, "mask/share_reasoning": 0.8573594093322754, "mask/share_step_conf": 0.1058732271194458, "num_tokens": 32003903.0, "reward": 0.6754347085952759, "reward_std": 0.2761334776878357, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.4688202738761902, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.5976740717887878, "step": 101 }, { "adv/mean_abs_final_conf": 0.6097878813743591, "adv/mean_abs_reasoning": 0.32521986961364746, "adv/mean_abs_step_conf": 0.7349889874458313, "adv/ratio_final_to_reasoning": 1.8750019243866338, "adv/ratio_step_to_reasoning": 2.2599756537599585, "adv/std_final_conf": 0.8272060751914978, "adv/std_reasoning": 0.6403584480285645, "adv/std_step_conf": 0.9255380630493164, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5119204329384233, "calib/avg_num_step_conf": 9.3984375, "calib/ece": 0.33240408163265317, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9551020408163265, "calib/gap": -0.0018097118619277985, "calib/mean_conf": 0.9813836734693878, "calib/mu_c": 0.9807484276729559, "calib/mu_w": 0.9825581395348837, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33240408163265317, "calib/std_conf": 0.02426822350770464, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8855560026827632, "calib/step_q_c_n": 1491.0, "calib/step_q_gap": 0.006915565524293288, "calib/step_q_w": 0.87864043715847, "calib/step_q_w_n": 915.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2415.0, "completions/max_terminated_length": 2415.0, "completions/mean_length": 752.43359375, "completions/mean_terminated_length": 773.5863037109375, "completions/min_length": 0.0, "completions/min_terminated_length": 380.0, "epoch": 0.1088, "grad_norm": 0.031540971249341965, "kl": 0.0884246826171875, "learning_rate": 2.7222222222222224e-06, "loss": -0.1251, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01865723729133606, "mask/share_reasoning": 0.8307026028633118, "mask/share_step_conf": 0.12329642474651337, "num_tokens": 32303222.0, "reward": 0.8231228590011597, "reward_std": 0.17860785126686096, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.6319171786308289, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.6987035274505615, "step": 102 }, { "adv/mean_abs_final_conf": 0.5567460060119629, "adv/mean_abs_reasoning": 0.3415173292160034, "adv/mean_abs_step_conf": 0.7269463539123535, "adv/ratio_final_to_reasoning": 1.63021304743172, "adv/ratio_step_to_reasoning": 2.12857823519864, "adv/std_final_conf": 0.7949619889259338, "adv/std_reasoning": 0.6612794995307922, "adv/std_step_conf": 0.9273793697357178, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5904428409734013, "calib/avg_num_step_conf": 8.5625, "calib/ece": 0.35893877551020414, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9224489795918367, "calib/gap": 0.010909734012450545, "calib/mean_conf": 0.9773061224489797, "calib/mu_c": 0.9814473684210526, "calib/mu_w": 0.9705376344086021, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3579183673469389, "calib/std_conf": 0.029569844044251104, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8903230890464934, "calib/step_q_c_n": 1269.0, "calib/step_q_gap": 0.019304670844976668, "calib/step_q_w": 0.8710184182015167, "calib/step_q_w_n": 923.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2749.0, "completions/max_terminated_length": 2749.0, "completions/mean_length": 844.40625, "completions/mean_terminated_length": 868.14453125, "completions/min_length": 0.0, "completions/min_terminated_length": 341.0, "epoch": 0.10986666666666667, "grad_norm": 0.02460324950516224, "kl": 0.06529998779296875, "learning_rate": 2.6944444444444444e-06, "loss": -0.1313, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.017562564462423325, "mask/share_reasoning": 0.8493900299072266, "mask/share_step_conf": 0.10570365190505981, "num_tokens": 32623942.0, "reward": 0.812166154384613, "reward_std": 0.17410849034786224, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6138265132904053, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7003495693206787, "step": 103 }, { "adv/mean_abs_final_conf": 0.6447658538818359, "adv/mean_abs_reasoning": 0.45572638511657715, "adv/mean_abs_step_conf": 0.7367246747016907, "adv/ratio_final_to_reasoning": 1.4148091375418201, "adv/ratio_step_to_reasoning": 1.6165942959682547, "adv/std_final_conf": 0.8271310925483704, "adv/std_reasoning": 0.7392915487289429, "adv/std_step_conf": 0.9331515431404114, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4921793534932221, "calib/avg_num_step_conf": 9.33984375, "calib/ece": 0.42979919678714884, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9558232931726908, "calib/gap": 0.004381517205422458, "calib/mean_conf": 0.9800000000000002, "calib/mu_c": 0.9819708029197083, "calib/mu_w": 0.9775892857142858, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42979919678714884, "calib/std_conf": 0.02084557873052321, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8894085173501577, "calib/step_q_c_n": 1268.0, "calib/step_q_gap": 0.013023833467699975, "calib/step_q_w": 0.8763846838824577, "calib/step_q_w_n": 1123.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2769.0, "completions/max_terminated_length": 2769.0, "completions/mean_length": 790.4609375, "completions/mean_terminated_length": 806.2072143554688, "completions/min_length": 0.0, "completions/min_terminated_length": 408.0, "epoch": 0.11093333333333333, "grad_norm": 0.02433106303215027, "kl": 0.0781707763671875, "learning_rate": 2.666666666666667e-06, "loss": -0.0497, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01839319057762623, "mask/share_reasoning": 0.8394039869308472, "mask/share_step_conf": 0.12267155945301056, "num_tokens": 32932980.0, "reward": 0.7579681873321533, "reward_std": 0.2207770049571991, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5539538860321045, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.6604199409484863, "step": 104 }, { "adv/mean_abs_final_conf": 0.7216190695762634, "adv/mean_abs_reasoning": 0.5883845090866089, "adv/mean_abs_step_conf": 0.7586150169372559, "adv/ratio_final_to_reasoning": 1.226441312495606, "adv/ratio_step_to_reasoning": 1.2893184732461904, "adv/std_final_conf": 0.8924412131309509, "adv/std_reasoning": 0.8267587423324585, "adv/std_step_conf": 0.9350519180297852, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.582585658777817, "calib/avg_num_step_conf": 9.62890625, "calib/ece": 0.34719262295081976, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9098360655737705, "calib/gap": 0.036468032497350866, "calib/mean_conf": 0.9578483606557378, "calib/mu_c": 0.9720469798657717, "calib/mu_w": 0.9355789473684208, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34719262295081976, "calib/std_conf": 0.10714046604586741, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8859402654867257, "calib/step_q_c_n": 1356.0, "calib/step_q_gap": 0.04099977856156789, "calib/step_q_w": 0.8449404869251578, "calib/step_q_w_n": 1109.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3010.0, "completions/max_terminated_length": 3010.0, "completions/mean_length": 839.49609375, "completions/mean_terminated_length": 866.5765991210938, "completions/min_length": 0.0, "completions/min_terminated_length": 450.0, "epoch": 0.112, "grad_norm": 0.02181658335030079, "kl": 0.07389068603515625, "learning_rate": 2.6388888888888893e-06, "loss": -0.1146, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01731002703309059, "mask/share_reasoning": 0.838695764541626, "mask/share_step_conf": 0.1127442717552185, "num_tokens": 33253651.0, "reward": 0.8030605316162109, "reward_std": 0.31802046298980713, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.617209255695343, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.6818804740905762, "step": 105 }, { "adv/mean_abs_final_conf": 0.6939954161643982, "adv/mean_abs_reasoning": 0.4305272102355957, "adv/mean_abs_step_conf": 0.7503472566604614, "adv/ratio_final_to_reasoning": 1.6119664440828858, "adv/ratio_step_to_reasoning": 1.7428567552091583, "adv/std_final_conf": 0.8775339722633362, "adv/std_reasoning": 0.7013723254203796, "adv/std_step_conf": 0.9299072623252869, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5432639271924986, "calib/avg_num_step_conf": 8.765625, "calib/ece": 0.3660121951219513, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.8861788617886179, "calib/gap": 0.013204771097628298, "calib/mean_conf": 0.9676382113821139, "calib/mu_c": 0.9728986486486487, "calib/mu_w": 0.9596938775510204, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3660121951219513, "calib/std_conf": 0.04002889204595677, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.889991889699919, "calib/step_q_c_n": 1233.0, "calib/step_q_gap": 0.020328190392302825, "calib/step_q_w": 0.8696636993076162, "calib/step_q_w_n": 1011.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2921.0, "completions/max_terminated_length": 2921.0, "completions/mean_length": 760.08203125, "completions/mean_terminated_length": 784.6007690429688, "completions/min_length": 0.0, "completions/min_terminated_length": 410.0, "epoch": 0.11306666666666666, "grad_norm": 0.02546885423362255, "kl": 0.0869293212890625, "learning_rate": 2.6111111111111113e-06, "loss": -0.0712, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.018718719482421875, "mask/share_reasoning": 0.835868239402771, "mask/share_step_conf": 0.11416309326887131, "num_tokens": 33552816.0, "reward": 0.7969148755073547, "reward_std": 0.19696387648582458, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6064382791519165, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.679578959941864, "step": 106 }, { "adv/mean_abs_final_conf": 0.6982734799385071, "adv/mean_abs_reasoning": 0.5172214508056641, "adv/mean_abs_step_conf": 0.7512728571891785, "adv/ratio_final_to_reasoning": 1.3500474097716217, "adv/ratio_step_to_reasoning": 1.4525168204430383, "adv/std_final_conf": 0.8845649361610413, "adv/std_reasoning": 0.7755434513092041, "adv/std_step_conf": 0.9340380430221558, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5586052322163434, "calib/avg_num_step_conf": 9.26953125, "calib/ece": 0.3012449799196789, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9437751004016064, "calib/gap": 0.013053350970017474, "calib/mean_conf": 0.974979919678715, "calib/mu_c": 0.9792261904761904, "calib/mu_w": 0.9661728395061729, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.30076305220883554, "calib/std_conf": 0.04145084181948238, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8911875843454791, "calib/step_q_c_n": 1482.0, "calib/step_q_gap": 0.0021303452882401874, "calib/step_q_w": 0.8890572390572389, "calib/step_q_w_n": 891.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2458.0, "completions/max_terminated_length": 2458.0, "completions/mean_length": 757.328125, "completions/mean_terminated_length": 775.5040283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 440.0, "epoch": 0.11413333333333334, "grad_norm": 0.018556181341409683, "kl": 0.08465576171875, "learning_rate": 2.5833333333333337e-06, "loss": -0.1297, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01875002309679985, "mask/share_reasoning": 0.8351476192474365, "mask/share_step_conf": 0.12266480922698975, "num_tokens": 33851308.0, "reward": 0.8634672164916992, "reward_std": 0.2831929624080658, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6712472438812256, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7322496175765991, "step": 107 }, { "adv/mean_abs_final_conf": 0.6516192555427551, "adv/mean_abs_reasoning": 0.37682825326919556, "adv/mean_abs_step_conf": 0.7557332515716553, "adv/ratio_final_to_reasoning": 1.7292208052066005, "adv/ratio_step_to_reasoning": 2.0055111181692116, "adv/std_final_conf": 0.879697859287262, "adv/std_reasoning": 0.6613324284553528, "adv/std_step_conf": 0.9278945922851562, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6354972375690608, "calib/avg_num_step_conf": 9.08984375, "calib/ece": 0.21908298755186717, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.8672199170124482, "calib/gap": 0.024716390423572765, "calib/mean_conf": 0.9638962655601662, "calib/mu_c": 0.9700497237569061, "calib/mu_w": 0.9453333333333334, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21597095435684643, "calib/std_conf": 0.0629978069380198, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8831625766871165, "calib/step_q_c_n": 1630.0, "calib/step_q_gap": 0.014439477691420621, "calib/step_q_w": 0.8687230989956959, "calib/step_q_w_n": 697.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2792.0, "completions/max_terminated_length": 2792.0, "completions/mean_length": 815.3203125, "completions/mean_terminated_length": 841.6209716796875, "completions/min_length": 0.0, "completions/min_terminated_length": 405.0, "epoch": 0.1152, "grad_norm": 0.03747807815670967, "kl": 0.081787109375, "learning_rate": 2.5555555555555557e-06, "loss": -0.1129, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.01746443659067154, "mask/share_reasoning": 0.8368449211120605, "mask/share_step_conf": 0.11444061994552612, "num_tokens": 34163262.0, "reward": 0.9091541767120361, "reward_std": 0.18528440594673157, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.7276929616928101, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.7601466774940491, "step": 108 }, { "adv/mean_abs_final_conf": 0.6833958625793457, "adv/mean_abs_reasoning": 0.43323150277137756, "adv/mean_abs_step_conf": 0.7291561365127563, "adv/ratio_final_to_reasoning": 1.5774380630394356, "adv/ratio_step_to_reasoning": 1.6830635164994971, "adv/std_final_conf": 0.879310667514801, "adv/std_reasoning": 0.7205653786659241, "adv/std_step_conf": 0.9332891702651978, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6254714439655172, "calib/avg_num_step_conf": 9.68359375, "calib/ece": 0.43293852459016396, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.8442622950819673, "calib/gap": 0.032313038793103654, "calib/mean_conf": 0.9575286885245904, "calib/mu_c": 0.9728906250000002, "calib/mu_w": 0.9405775862068966, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.43293852459016396, "calib/std_conf": 0.05920507562909826, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8847472150814053, "calib/step_q_c_n": 1167.0, "calib/step_q_gap": 0.0233790748375029, "calib/step_q_w": 0.8613681402439024, "calib/step_q_w_n": 1312.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2984.0, "completions/max_terminated_length": 2984.0, "completions/mean_length": 840.765625, "completions/mean_terminated_length": 860.9440307617188, "completions/min_length": 0.0, "completions/min_terminated_length": 483.0, "epoch": 0.11626666666666667, "grad_norm": 0.03594356030225754, "kl": 0.0813751220703125, "learning_rate": 2.5277777777777778e-06, "loss": -0.0475, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.017205409705638885, "mask/share_reasoning": 0.8475127220153809, "mask/share_step_conf": 0.11184436082839966, "num_tokens": 34483098.0, "reward": 0.7405705451965332, "reward_std": 0.23001110553741455, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.5487913489341736, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.6409435272216797, "step": 109 }, { "adv/mean_abs_final_conf": 0.7198961973190308, "adv/mean_abs_reasoning": 0.47418078780174255, "adv/mean_abs_step_conf": 0.7653753757476807, "adv/ratio_final_to_reasoning": 1.518189297918209, "adv/ratio_step_to_reasoning": 1.614100350408309, "adv/std_final_conf": 0.8994455337524414, "adv/std_reasoning": 0.7393791675567627, "adv/std_step_conf": 0.9331598281860352, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5265456989247311, "calib/avg_num_step_conf": 9.43359375, "calib/ece": 0.35470119521912363, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8884462151394422, "calib/gap": 0.004077284946236603, "calib/mean_conf": 0.9682470119521913, "calib/mu_c": 0.9698064516129034, "calib/mu_w": 0.9657291666666667, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3527091633466137, "calib/std_conf": 0.046478744028319606, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8881674842326559, "calib/step_q_c_n": 1427.0, "calib/step_q_gap": 0.010019710953303673, "calib/step_q_w": 0.8781477732793522, "calib/step_q_w_n": 988.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2747.0, "completions/max_terminated_length": 2747.0, "completions/mean_length": 755.0390625, "completions/mean_terminated_length": 763.9921264648438, "completions/min_length": 0.0, "completions/min_terminated_length": 306.0, "epoch": 0.11733333333333333, "grad_norm": 0.020071037113666534, "kl": 0.09243011474609375, "learning_rate": 2.5e-06, "loss": -0.0492, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.019367707893252373, "mask/share_reasoning": 0.8397272825241089, "mask/share_step_conf": 0.1291862428188324, "num_tokens": 34781308.0, "reward": 0.8190223574638367, "reward_std": 0.2533964216709137, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6280652284622192, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.6927918791770935, "step": 110 }, { "adv/mean_abs_final_conf": 0.712738037109375, "adv/mean_abs_reasoning": 0.506923496723175, "adv/mean_abs_step_conf": 0.7473180294036865, "adv/ratio_final_to_reasoning": 1.4060071030769221, "adv/ratio_step_to_reasoning": 1.4742225093815056, "adv/std_final_conf": 0.9178887605667114, "adv/std_reasoning": 0.7756215333938599, "adv/std_step_conf": 0.9329038262367249, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.614922206506365, "calib/avg_num_step_conf": 9.5625, "calib/ece": 0.37982987551867237, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.8008298755186722, "calib/gap": 0.021407355021216268, "calib/mean_conf": 0.9607427385892118, "calib/mu_c": 0.9697142857142856, "calib/mu_w": 0.9483069306930694, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37982987551867237, "calib/std_conf": 0.05120085133630464, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8879902755267425, "calib/step_q_c_n": 1234.0, "calib/step_q_gap": 0.039388957569576055, "calib/step_q_w": 0.8486013179571664, "calib/step_q_w_n": 1214.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2456.0, "completions/max_terminated_length": 2456.0, "completions/mean_length": 791.375, "completions/mean_terminated_length": 816.9031982421875, "completions/min_length": 0.0, "completions/min_terminated_length": 360.0, "epoch": 0.1184, "grad_norm": 0.031903259456157684, "kl": 0.0840606689453125, "learning_rate": 2.4722222222222226e-06, "loss": -0.1046, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.018131183460354805, "mask/share_reasoning": 0.8329363465309143, "mask/share_step_conf": 0.11768247932195663, "num_tokens": 35091308.0, "reward": 0.7689356803894043, "reward_std": 0.2724992632865906, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5790972709655762, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.6626803278923035, "step": 111 }, { "adv/mean_abs_final_conf": 0.7234814167022705, "adv/mean_abs_reasoning": 0.458984911441803, "adv/mean_abs_step_conf": 0.7522188425064087, "adv/ratio_final_to_reasoning": 1.5762640528413197, "adv/ratio_step_to_reasoning": 1.6388748818419194, "adv/std_final_conf": 0.8879250884056091, "adv/std_reasoning": 0.7393825650215149, "adv/std_step_conf": 0.9295467138290405, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.6773631479599246, "calib/avg_num_step_conf": 9.70703125, "calib/ece": 0.359121338912134, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.7364016736401674, "calib/gap": 0.03483229272542465, "calib/mean_conf": 0.9438912133891215, "calib/mu_c": 0.9580281690140846, "calib/mu_w": 0.92319587628866, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35443514644351476, "calib/std_conf": 0.08277309741101296, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8867267032106498, "calib/step_q_c_n": 1277.0, "calib/step_q_gap": 0.02705782903846443, "calib/step_q_w": 0.8596688741721854, "calib/step_q_w_n": 1208.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3048.0, "completions/max_terminated_length": 3048.0, "completions/mean_length": 826.0078125, "completions/mean_terminated_length": 856.1052856445312, "completions/min_length": 0.0, "completions/min_terminated_length": 489.0, "epoch": 0.11946666666666667, "grad_norm": 0.040970560163259506, "kl": 0.0791778564453125, "learning_rate": 2.4444444444444447e-06, "loss": -0.1348, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.017311958596110344, "mask/share_reasoning": 0.8367998600006104, "mask/share_step_conf": 0.11073192209005356, "num_tokens": 35410686.0, "reward": 0.7834147214889526, "reward_std": 0.21543002128601074, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6035550832748413, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.6656181216239929, "step": 112 }, { "adv/mean_abs_final_conf": 0.7123876810073853, "adv/mean_abs_reasoning": 0.5197149515151978, "adv/mean_abs_step_conf": 0.7662708163261414, "adv/ratio_final_to_reasoning": 1.3707277016573445, "adv/ratio_step_to_reasoning": 1.4744059490536587, "adv/std_final_conf": 0.8939816355705261, "adv/std_reasoning": 0.7754083871841431, "adv/std_step_conf": 0.9343150854110718, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5899899564780717, "calib/avg_num_step_conf": 10.49609375, "calib/ece": 0.3649193548387099, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.7983870967741935, "calib/gap": 0.037502510880482065, "calib/mean_conf": 0.9495967741935485, "calib/mu_c": 0.9651724137931036, "calib/mu_w": 0.9276699029126215, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3649193548387099, "calib/std_conf": 0.09619284388872504, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8905063291139242, "calib/step_q_c_n": 1343.0, "calib/step_q_gap": 0.02546168625678147, "calib/step_q_w": 0.8650446428571428, "calib/step_q_w_n": 1344.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2983.0, "completions/max_terminated_length": 2983.0, "completions/mean_length": 777.46875, "completions/mean_terminated_length": 792.9561767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 417.0, "epoch": 0.12053333333333334, "grad_norm": 0.034543026238679886, "kl": 0.09393310546875, "learning_rate": 2.4166666666666667e-06, "loss": -0.0128, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01877540722489357, "mask/share_reasoning": 0.833499550819397, "mask/share_step_conf": 0.12819385528564453, "num_tokens": 35714918.0, "reward": 0.8004428148269653, "reward_std": 0.25604891777038574, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6131843328475952, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.680669903755188, "step": 113 }, { "adv/mean_abs_final_conf": 0.7102067470550537, "adv/mean_abs_reasoning": 0.4649397134780884, "adv/mean_abs_step_conf": 0.7417643666267395, "adv/ratio_final_to_reasoning": 1.5275243788967583, "adv/ratio_step_to_reasoning": 1.5953990272799041, "adv/std_final_conf": 0.9010103344917297, "adv/std_reasoning": 0.7395108342170715, "adv/std_step_conf": 0.9281858205795288, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.6205092287409362, "calib/avg_num_step_conf": 10.296875, "calib/ece": 0.26420168067226923, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.7857142857142857, "calib/gap": 0.04142880685563621, "calib/mean_conf": 0.9532773109243698, "calib/mu_c": 0.966158536585366, "calib/mu_w": 0.9247297297297298, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26420168067226923, "calib/std_conf": 0.0815769909447045, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8923828382838284, "calib/step_q_c_n": 1515.0, "calib/step_q_gap": 0.03360183917588899, "calib/step_q_w": 0.8587809991079394, "calib/step_q_w_n": 1121.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2915.0, "completions/max_terminated_length": 2915.0, "completions/mean_length": 782.15625, "completions/mean_terminated_length": 820.6229248046875, "completions/min_length": 0.0, "completions/min_terminated_length": 388.0, "epoch": 0.1216, "grad_norm": 0.05427626147866249, "kl": 0.07926177978515625, "learning_rate": 2.388888888888889e-06, "loss": -0.1523, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.018315622583031654, "mask/share_reasoning": 0.8093851804733276, "mask/share_step_conf": 0.12542423605918884, "num_tokens": 36020174.0, "reward": 0.8564561009407043, "reward_std": 0.24829316139221191, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6759241819381714, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.7229254245758057, "step": 114 }, { "adv/mean_abs_final_conf": 0.7221448421478271, "adv/mean_abs_reasoning": 0.5368207693099976, "adv/mean_abs_step_conf": 0.7720561027526855, "adv/ratio_final_to_reasoning": 1.3452252286662378, "adv/ratio_step_to_reasoning": 1.4382008798673116, "adv/std_final_conf": 0.9052088856697083, "adv/std_reasoning": 0.792934775352478, "adv/std_step_conf": 0.9331409335136414, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5650809716599191, "calib/avg_num_step_conf": 9.82421875, "calib/ece": 0.4268852459016395, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.8278688524590164, "calib/gap": 0.0170013495276653, "calib/mean_conf": 0.9596721311475411, "calib/mu_c": 0.9676153846153848, "calib/mu_w": 0.9506140350877195, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4268852459016395, "calib/std_conf": 0.05791365005481284, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.889716494845361, "calib/step_q_c_n": 1164.0, "calib/step_q_gap": 0.019390810167344674, "calib/step_q_w": 0.8703256846780163, "calib/step_q_w_n": 1351.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2484.0, "completions/max_terminated_length": 2484.0, "completions/mean_length": 783.9375, "completions/mean_terminated_length": 815.8048706054688, "completions/min_length": 0.0, "completions/min_terminated_length": 420.0, "epoch": 0.12266666666666666, "grad_norm": 0.03652571141719818, "kl": 0.07973480224609375, "learning_rate": 2.361111111111111e-06, "loss": -0.1839, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.017669178545475006, "mask/share_reasoning": 0.8261470794677734, "mask/share_step_conf": 0.11712120473384857, "num_tokens": 36326126.0, "reward": 0.7415735721588135, "reward_std": 0.26013612747192383, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5470499992370605, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.6439095735549927, "step": 115 }, { "adv/mean_abs_final_conf": 0.7074733972549438, "adv/mean_abs_reasoning": 0.4718044102191925, "adv/mean_abs_step_conf": 0.7679315805435181, "adv/ratio_final_to_reasoning": 1.4995056890762497, "adv/ratio_step_to_reasoning": 1.627648160784995, "adv/std_final_conf": 0.8698757290840149, "adv/std_reasoning": 0.7208515405654907, "adv/std_step_conf": 0.9321540594100952, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.49127605068199126, "calib/avg_num_step_conf": 9.60546875, "calib/ece": 0.3838114754098362, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.8934426229508197, "calib/gap": 0.00012808973205025165, "calib/mean_conf": 0.969877049180328, "calib/mu_c": 0.96993006993007, "calib/mu_w": 0.9698019801980198, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3838114754098362, "calib/std_conf": 0.0437343091991714, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8948684210526318, "calib/step_q_c_n": 1368.0, "calib/step_q_gap": 0.028828091080129514, "calib/step_q_w": 0.8660403299725022, "calib/step_q_w_n": 1091.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2561.0, "completions/max_terminated_length": 2561.0, "completions/mean_length": 815.3984375, "completions/mean_terminated_length": 841.7015991210938, "completions/min_length": 0.0, "completions/min_terminated_length": 391.0, "epoch": 0.12373333333333333, "grad_norm": 0.03004090115427971, "kl": 0.07421112060546875, "learning_rate": 2.3333333333333336e-06, "loss": -0.1704, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01729145646095276, "mask/share_reasoning": 0.8337869644165039, "mask/share_step_conf": 0.11767159402370453, "num_tokens": 36639388.0, "reward": 0.7805377244949341, "reward_std": 0.25898727774620056, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5796562433242798, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.6798566579818726, "step": 116 }, { "adv/mean_abs_final_conf": 0.6913862228393555, "adv/mean_abs_reasoning": 0.5055603384971619, "adv/mean_abs_step_conf": 0.780232310295105, "adv/ratio_final_to_reasoning": 1.3675642058761632, "adv/ratio_step_to_reasoning": 1.5433020569106315, "adv/std_final_conf": 0.865074634552002, "adv/std_reasoning": 0.7577515840530396, "adv/std_step_conf": 0.934880256652832, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6497433535140829, "calib/avg_num_step_conf": 9.4296875, "calib/ece": 0.4290283400809717, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8663967611336032, "calib/gap": 0.03380165833113968, "calib/mean_conf": 0.9593927125506073, "calib/mu_c": 0.9752671755725191, "calib/mu_w": 0.9414655172413794, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4290283400809717, "calib/std_conf": 0.08955601026861996, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8945716709075489, "calib/step_q_c_n": 1179.0, "calib/step_q_gap": 0.018442116251678242, "calib/step_q_w": 0.8761295546558706, "calib/step_q_w_n": 1235.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2718.0, "completions/max_terminated_length": 2718.0, "completions/mean_length": 789.875, "completions/mean_terminated_length": 808.83203125, "completions/min_length": 0.0, "completions/min_terminated_length": 416.0, "epoch": 0.1248, "grad_norm": 0.03556828573346138, "kl": 0.07696533203125, "learning_rate": 2.305555555555556e-06, "loss": -0.0637, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.01829782873392105, "mask/share_reasoning": 0.8373548984527588, "mask/share_step_conf": 0.12090975791215897, "num_tokens": 36948196.0, "reward": 0.7476822137832642, "reward_std": 0.26584988832473755, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5554363131523132, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.6446155905723572, "step": 117 }, { "adv/mean_abs_final_conf": 0.625870943069458, "adv/mean_abs_reasoning": 0.3719392418861389, "adv/mean_abs_step_conf": 0.7527309656143188, "adv/ratio_final_to_reasoning": 1.6827236080162113, "adv/ratio_step_to_reasoning": 2.023800881555679, "adv/std_final_conf": 0.846591055393219, "adv/std_reasoning": 0.6613606214523315, "adv/std_step_conf": 0.9336501955986023, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6724489795918368, "calib/avg_num_step_conf": 10.08203125, "calib/ece": 0.35510080645161307, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8266129032258065, "calib/gap": 0.040221088435373975, "calib/mean_conf": 0.9599395161290325, "calib/mu_c": 0.9758333333333333, "calib/mu_w": 0.9356122448979594, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.35510080645161307, "calib/std_conf": 0.08253845860053945, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8710762483130904, "calib/step_q_c_n": 1482.0, "calib/step_q_gap": 0.03379690345412767, "calib/step_q_w": 0.8372793448589627, "calib/step_q_w_n": 1099.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2620.0, "completions/max_terminated_length": 2620.0, "completions/mean_length": 815.83984375, "completions/mean_terminated_length": 832.0916748046875, "completions/min_length": 0.0, "completions/min_terminated_length": 430.0, "epoch": 0.12586666666666665, "grad_norm": 0.029230020940303802, "kl": 0.07103729248046875, "learning_rate": 2.277777777777778e-06, "loss": -0.1422, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.018248315900564194, "mask/share_reasoning": 0.8373794555664062, "mask/share_step_conf": 0.12484100461006165, "num_tokens": 37261059.0, "reward": 0.8076679110527039, "reward_std": 0.19901418685913086, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6259959936141968, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.6791835427284241, "step": 118 }, { "adv/mean_abs_final_conf": 0.6673562526702881, "adv/mean_abs_reasoning": 0.5177932977676392, "adv/mean_abs_step_conf": 0.7421085834503174, "adv/ratio_final_to_reasoning": 1.2888468343399948, "adv/ratio_step_to_reasoning": 1.433213961342814, "adv/std_final_conf": 0.8697852492332458, "adv/std_reasoning": 0.792900025844574, "adv/std_step_conf": 0.9320849776268005, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.6407605552342395, "calib/avg_num_step_conf": 8.83984375, "calib/ece": 0.39105485232067516, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.810126582278481, "calib/gap": 0.03139965297860037, "calib/mean_conf": 0.9522362869198314, "calib/mu_c": 0.9660150375939851, "calib/mu_w": 0.9346153846153847, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39105485232067516, "calib/std_conf": 0.08091029485832527, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.885243578387954, "calib/step_q_c_n": 1129.0, "calib/step_q_gap": 0.07308308456079349, "calib/step_q_w": 0.8121604938271605, "calib/step_q_w_n": 1134.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3053.0, "completions/max_terminated_length": 3053.0, "completions/mean_length": 853.38671875, "completions/mean_terminated_length": 877.3775024414062, "completions/min_length": 0.0, "completions/min_terminated_length": 443.0, "epoch": 0.12693333333333334, "grad_norm": 0.03167301416397095, "kl": 0.070526123046875, "learning_rate": 2.25e-06, "loss": -0.0566, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.017237205058336258, "mask/share_reasoning": 0.846515417098999, "mask/share_step_conf": 0.10890362411737442, "num_tokens": 37584590.0, "reward": 0.7539892792701721, "reward_std": 0.266414999961853, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.5644835829734802, "rewards/format_reward_step": 0.92578125, "rewards/step_l2_reward": 0.6544324159622192, "step": 119 }, { "adv/mean_abs_final_conf": 0.6957762241363525, "adv/mean_abs_reasoning": 0.5754411220550537, "adv/mean_abs_step_conf": 0.7391219139099121, "adv/ratio_final_to_reasoning": 1.209118009591581, "adv/ratio_step_to_reasoning": 1.2844440301212932, "adv/std_final_conf": 0.8899182081222534, "adv/std_reasoning": 0.8429538011550903, "adv/std_step_conf": 0.9355378746986389, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6211656441717792, "calib/avg_num_step_conf": 9.72265625, "calib/ece": 0.27576763485477174, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.8008298755186722, "calib/gap": 0.05051753971999384, "calib/mean_conf": 0.9521161825726142, "calib/mu_c": 0.9684662576687117, "calib/mu_w": 0.9179487179487179, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27576763485477174, "calib/std_conf": 0.07791954428304795, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8885578231292517, "calib/step_q_c_n": 1470.0, "calib/step_q_gap": 0.04782180742758335, "calib/step_q_w": 0.8407360157016683, "calib/step_q_w_n": 1019.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2620.0, "completions/max_terminated_length": 2620.0, "completions/mean_length": 816.375, "completions/mean_terminated_length": 846.1214599609375, "completions/min_length": 0.0, "completions/min_terminated_length": 472.0, "epoch": 0.128, "grad_norm": 0.034337036311626434, "kl": 0.07117462158203125, "learning_rate": 2.222222222222222e-06, "loss": -0.1318, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.01761537417769432, "mask/share_reasoning": 0.832998514175415, "mask/share_step_conf": 0.11422987282276154, "num_tokens": 37900270.0, "reward": 0.8556807637214661, "reward_std": 0.29194173216819763, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6786898374557495, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.7178279161453247, "step": 120 }, { "adv/mean_abs_final_conf": 0.7192496061325073, "adv/mean_abs_reasoning": 0.5807757377624512, "adv/mean_abs_step_conf": 0.7497106790542603, "adv/ratio_final_to_reasoning": 1.2384291549498143, "adv/ratio_step_to_reasoning": 1.2908780968410682, "adv/std_final_conf": 0.9046310782432556, "adv/std_reasoning": 0.8265811800956726, "adv/std_step_conf": 0.9339022636413574, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5918091809180919, "calib/avg_num_step_conf": 9.27734375, "calib/ece": 0.36618852459016404, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.7704918032786885, "calib/gap": 0.024492833898774657, "calib/mean_conf": 0.9440573770491805, "calib/mu_c": 0.9541958041958043, "calib/mu_w": 0.9297029702970296, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3620901639344263, "calib/std_conf": 0.10597871725564956, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8727382875605815, "calib/step_q_c_n": 1238.0, "calib/step_q_gap": 0.0370126938930353, "calib/step_q_w": 0.8357255936675462, "calib/step_q_w_n": 1137.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2797.0, "completions/max_terminated_length": 2797.0, "completions/mean_length": 835.12109375, "completions/mean_terminated_length": 865.5506591796875, "completions/min_length": 0.0, "completions/min_terminated_length": 443.0, "epoch": 0.12906666666666666, "grad_norm": 0.03736064210534096, "kl": 0.069427490234375, "learning_rate": 2.1944444444444445e-06, "loss": -0.1096, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.017177481204271317, "mask/share_reasoning": 0.8382337093353271, "mask/share_step_conf": 0.10943257063627243, "num_tokens": 38219117.0, "reward": 0.7833296060562134, "reward_std": 0.27490270137786865, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5992906093597412, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.6658061742782593, "step": 121 }, { "adv/mean_abs_final_conf": 0.6296117901802063, "adv/mean_abs_reasoning": 0.4547215402126312, "adv/mean_abs_step_conf": 0.7533493041992188, "adv/ratio_final_to_reasoning": 1.3846095566218286, "adv/ratio_step_to_reasoning": 1.6567266724311036, "adv/std_final_conf": 0.8445394039154053, "adv/std_reasoning": 0.7206957936286926, "adv/std_step_conf": 0.9326170682907104, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6663586635127143, "calib/avg_num_step_conf": 9.625, "calib/ece": 0.32323651452282165, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.8008298755186722, "calib/gap": 0.06292652276759325, "calib/mean_conf": 0.9456431535269711, "calib/mu_c": 0.9688815789473685, "calib/mu_w": 0.9059550561797752, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31908713692946067, "calib/std_conf": 0.11622208871556197, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.88203611898017, "calib/step_q_c_n": 1412.0, "calib/step_q_gap": 0.04434600491172891, "calib/step_q_w": 0.8376901140684411, "calib/step_q_w_n": 1052.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2919.0, "completions/max_terminated_length": 2919.0, "completions/mean_length": 809.8984375, "completions/mean_terminated_length": 836.024169921875, "completions/min_length": 0.0, "completions/min_terminated_length": 465.0, "epoch": 0.13013333333333332, "grad_norm": 0.034300535917282104, "kl": 0.07027435302734375, "learning_rate": 2.166666666666667e-06, "loss": -0.0697, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.017662089318037033, "mask/share_reasoning": 0.8331197500228882, "mask/share_step_conf": 0.11796815693378448, "num_tokens": 38533795.0, "reward": 0.8321857452392578, "reward_std": 0.2278578281402588, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6436429619789124, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.7136971950531006, "step": 122 }, { "adv/mean_abs_final_conf": 0.6647226810455322, "adv/mean_abs_reasoning": 0.47725650668144226, "adv/mean_abs_step_conf": 0.748039960861206, "adv/ratio_final_to_reasoning": 1.392799619784376, "adv/ratio_step_to_reasoning": 1.5673750915679092, "adv/std_final_conf": 0.8729745745658875, "adv/std_reasoning": 0.7577446699142456, "adv/std_step_conf": 0.9337034225463867, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5375591615956727, "calib/avg_num_step_conf": 8.8984375, "calib/ece": 0.3648987854251011, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.7611336032388664, "calib/gap": 0.007183231913455268, "calib/mean_conf": 0.944412955465587, "calib/mu_c": 0.9473793103448276, "calib/mu_w": 0.9401960784313723, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36113360323886634, "calib/std_conf": 0.0868403971029498, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8690188383045526, "calib/step_q_c_n": 1274.0, "calib/step_q_gap": 0.010562663005747819, "calib/step_q_w": 0.8584561752988048, "calib/step_q_w_n": 1004.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2915.0, "completions/max_terminated_length": 2915.0, "completions/mean_length": 899.86328125, "completions/mean_terminated_length": 899.86328125, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 0.1312, "grad_norm": 0.026797449216246605, "kl": 0.0637359619140625, "learning_rate": 2.138888888888889e-06, "loss": 0.0474, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.017092389985919, "mask/share_reasoning": 0.8718984723091125, "mask/share_step_conf": 0.1110091581940651, "num_tokens": 38869448.0, "reward": 0.7914524078369141, "reward_std": 0.2560619115829468, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6038050651550293, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.6728497743606567, "step": 123 }, { "adv/mean_abs_final_conf": 0.6712331771850586, "adv/mean_abs_reasoning": 0.43861687183380127, "adv/mean_abs_step_conf": 0.7770863771438599, "adv/ratio_final_to_reasoning": 1.5303405324531139, "adv/ratio_step_to_reasoning": 1.7716746140999107, "adv/std_final_conf": 0.8607048392295837, "adv/std_reasoning": 0.7207204699516296, "adv/std_step_conf": 0.9323819875717163, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6848791951237425, "calib/avg_num_step_conf": 10.140625, "calib/ece": 0.30999999999999994, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.7107438016528925, "calib/gap": 0.04948373356833369, "calib/mean_conf": 0.9389256198347109, "calib/mu_c": 0.9571241830065359, "calib/mu_w": 0.9076404494382022, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3083471074380164, "calib/std_conf": 0.08353058184853406, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8742908827785818, "calib/step_q_c_n": 1382.0, "calib/step_q_gap": 0.06927440831400189, "calib/step_q_w": 0.8050164744645799, "calib/step_q_w_n": 1214.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2449.0, "completions/max_terminated_length": 2449.0, "completions/mean_length": 820.33203125, "completions/mean_terminated_length": 846.7943115234375, "completions/min_length": 0.0, "completions/min_terminated_length": 393.0, "epoch": 0.13226666666666667, "grad_norm": 0.04688045009970665, "kl": 0.0717620849609375, "learning_rate": 2.1111111111111114e-06, "loss": -0.1779, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.01713072881102562, "mask/share_reasoning": 0.8390701413154602, "mask/share_step_conf": 0.11254914104938507, "num_tokens": 39186269.0, "reward": 0.8271832466125488, "reward_std": 0.2311488538980484, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6517531275749207, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.6940196752548218, "step": 124 }, { "adv/mean_abs_final_conf": 0.7343705296516418, "adv/mean_abs_reasoning": 0.5540212392807007, "adv/mean_abs_step_conf": 0.7735514044761658, "adv/ratio_final_to_reasoning": 1.3255277552266644, "adv/ratio_step_to_reasoning": 1.396248644691829, "adv/std_final_conf": 0.9081211090087891, "adv/std_reasoning": 0.7930463552474976, "adv/std_step_conf": 0.9331655502319336, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.6197039842873177, "calib/avg_num_step_conf": 10.15234375, "calib/ece": 0.3788333333333334, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.6791666666666667, "calib/gap": 0.051077441077441166, "calib/mean_conf": 0.9238333333333334, "calib/mu_c": 0.9468181818181819, "calib/mu_w": 0.8957407407407407, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3763333333333334, "calib/std_conf": 0.12223054264608152, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8688908145580589, "calib/step_q_c_n": 1154.0, "calib/step_q_gap": 0.036157250544218034, "calib/step_q_w": 0.8327335640138409, "calib/step_q_w_n": 1445.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2610.0, "completions/max_terminated_length": 2610.0, "completions/mean_length": 844.04296875, "completions/mean_terminated_length": 885.55322265625, "completions/min_length": 0.0, "completions/min_terminated_length": 431.0, "epoch": 0.13333333333333333, "grad_norm": 0.036926452070474625, "kl": 0.064727783203125, "learning_rate": 2.0833333333333334e-06, "loss": -0.1825, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.016466014087200165, "mask/share_reasoning": 0.8317209482192993, "mask/share_step_conf": 0.10493804514408112, "num_tokens": 39507152.0, "reward": 0.7655426263809204, "reward_std": 0.2716066837310791, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5841484069824219, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.6563118696212769, "step": 125 }, { "adv/mean_abs_final_conf": 0.6951326131820679, "adv/mean_abs_reasoning": 0.46520280838012695, "adv/mean_abs_step_conf": 0.7669442296028137, "adv/ratio_final_to_reasoning": 1.4942571297077392, "adv/ratio_step_to_reasoning": 1.6486233870198985, "adv/std_final_conf": 0.8844326138496399, "adv/std_reasoning": 0.7206853628158569, "adv/std_step_conf": 0.9299399852752686, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.6635110104959868, "calib/avg_num_step_conf": 9.2421875, "calib/ece": 0.36739669421487586, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.6198347107438017, "calib/gap": 0.08087192152020317, "calib/mean_conf": 0.9004545454545455, "calib/mu_c": 0.9382170542635659, "calib/mu_w": 0.8573451327433628, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36739669421487586, "calib/std_conf": 0.14376935730249876, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8738846480067853, "calib/step_q_c_n": 1179.0, "calib/step_q_gap": 0.09074648456954859, "calib/step_q_w": 0.7831381634372367, "calib/step_q_w_n": 1187.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3049.0, "completions/max_terminated_length": 3049.0, "completions/mean_length": 849.1796875, "completions/mean_terminated_length": 876.5725708007812, "completions/min_length": 0.0, "completions/min_terminated_length": 435.0, "epoch": 0.1344, "grad_norm": 0.044366173446178436, "kl": 0.06201934814453125, "learning_rate": 2.0555555555555555e-06, "loss": -0.1284, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.017499610781669617, "mask/share_reasoning": 0.8362378478050232, "mask/share_step_conf": 0.115012526512146, "num_tokens": 39830006.0, "reward": 0.7772961258888245, "reward_std": 0.23498398065567017, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6009371280670166, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.6638113260269165, "step": 126 }, { "adv/mean_abs_final_conf": 0.6320117712020874, "adv/mean_abs_reasoning": 0.43845459818840027, "adv/mean_abs_step_conf": 0.7619009613990784, "adv/ratio_final_to_reasoning": 1.4414531716930865, "adv/ratio_step_to_reasoning": 1.7376963647937291, "adv/std_final_conf": 0.8390277028083801, "adv/std_reasoning": 0.7206178903579712, "adv/std_step_conf": 0.9327162504196167, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.72906454248366, "calib/avg_num_step_conf": 9.70703125, "calib/ece": 0.3370901639344262, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.6557377049180327, "calib/gap": 0.1311111111111113, "calib/mean_conf": 0.8944672131147541, "calib/mu_c": 0.9525000000000001, "calib/mu_w": 0.8213888888888888, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3370901639344262, "calib/std_conf": 0.1769925294536388, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8744261006289308, "calib/step_q_c_n": 1272.0, "calib/step_q_gap": 0.07998257218705107, "calib/step_q_w": 0.7944435284418797, "calib/step_q_w_n": 1213.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2990.0, "completions/max_terminated_length": 2990.0, "completions/mean_length": 803.41015625, "completions/mean_terminated_length": 832.6842041015625, "completions/min_length": 0.0, "completions/min_terminated_length": 430.0, "epoch": 0.13546666666666668, "grad_norm": 0.039158936589956284, "kl": 0.06972503662109375, "learning_rate": 2.027777777777778e-06, "loss": -0.1332, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.017885904759168625, "mask/share_reasoning": 0.828772783279419, "mask/share_step_conf": 0.11818505823612213, "num_tokens": 40139351.0, "reward": 0.8012007474899292, "reward_std": 0.21756742894649506, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6414800882339478, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.6640462875366211, "step": 127 }, { "adv/mean_abs_final_conf": 0.7376161813735962, "adv/mean_abs_reasoning": 0.6009324789047241, "adv/mean_abs_step_conf": 0.7559952139854431, "adv/ratio_final_to_reasoning": 1.2274526794058387, "adv/ratio_step_to_reasoning": 1.2580368685735552, "adv/std_final_conf": 0.9184370040893555, "adv/std_reasoning": 0.8267238140106201, "adv/std_step_conf": 0.9349108934402466, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.6201575366988901, "calib/avg_num_step_conf": 9.3828125, "calib/ece": 0.3379411764705882, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.5882352941176471, "calib/gap": 0.06202005012531342, "calib/mean_conf": 0.8854201680672269, "calib/mu_c": 0.9127819548872181, "calib/mu_w": 0.8507619047619047, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33226890756302513, "calib/std_conf": 0.1614054342799455, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8687878787878787, "calib/step_q_c_n": 1188.0, "calib/step_q_gap": 0.06706629723927893, "calib/step_q_w": 0.8017215815485997, "calib/step_q_w_n": 1214.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2986.0, "completions/max_terminated_length": 2986.0, "completions/mean_length": 856.234375, "completions/mean_terminated_length": 891.0405883789062, "completions/min_length": 0.0, "completions/min_terminated_length": 335.0, "epoch": 0.13653333333333334, "grad_norm": 0.04041381925344467, "kl": 0.066192626953125, "learning_rate": 2.0000000000000003e-06, "loss": -0.1361, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.017022691667079926, "mask/share_reasoning": 0.8357027769088745, "mask/share_step_conf": 0.10821200907230377, "num_tokens": 40465211.0, "reward": 0.7812422513961792, "reward_std": 0.30478304624557495, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6055277585983276, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.6671130657196045, "step": 128 }, { "adv/mean_abs_final_conf": 0.6258381605148315, "adv/mean_abs_reasoning": 0.3574847877025604, "adv/mean_abs_step_conf": 0.7670409679412842, "adv/ratio_final_to_reasoning": 1.7506707475215708, "adv/ratio_step_to_reasoning": 2.14566044298223, "adv/std_final_conf": 0.8416181206703186, "adv/std_reasoning": 0.6404501795768738, "adv/std_step_conf": 0.9318187236785889, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.6007423117709437, "calib/avg_num_step_conf": 9.50390625, "calib/ece": 0.24427983539094644, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6419753086419753, "calib/gap": 0.06941221027117139, "calib/mean_conf": 0.8911111111111112, "calib/mu_c": 0.9145341614906833, "calib/mu_w": 0.845121951219512, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23641975308641971, "calib/std_conf": 0.1703059459512129, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.864235924932976, "calib/step_q_c_n": 1492.0, "calib/step_q_gap": 0.018688634816078964, "calib/step_q_w": 0.845547290116897, "calib/step_q_w_n": 941.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2888.0, "completions/max_terminated_length": 2888.0, "completions/mean_length": 815.046875, "completions/mean_terminated_length": 831.2828979492188, "completions/min_length": 0.0, "completions/min_terminated_length": 418.0, "epoch": 0.1376, "grad_norm": 0.053492747247219086, "kl": 0.070281982421875, "learning_rate": 1.9722222222222224e-06, "loss": -0.066, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.018132753670215607, "mask/share_reasoning": 0.8407728672027588, "mask/share_step_conf": 0.1215631514787674, "num_tokens": 40776247.0, "reward": 0.8621791005134583, "reward_std": 0.18524247407913208, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6893390417098999, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.7193941473960876, "step": 129 }, { "adv/mean_abs_final_conf": 0.6381160020828247, "adv/mean_abs_reasoning": 0.3434726595878601, "adv/mean_abs_step_conf": 0.7246782779693604, "adv/ratio_final_to_reasoning": 1.8578363787339383, "adv/ratio_step_to_reasoning": 2.109857241152518, "adv/std_final_conf": 0.8502756357192993, "adv/std_reasoning": 0.6401984691619873, "adv/std_step_conf": 0.9270868301391602, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6616675839295543, "calib/avg_num_step_conf": 9.28125, "calib/ece": 0.28260000000000013, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.716, "calib/gap": 0.11069345074298276, "calib/mean_conf": 0.8914800000000002, "calib/mu_c": 0.9322151898734177, "calib/mu_w": 0.8215217391304349, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2710400000000001, "calib/std_conf": 0.20137082608958032, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8756263577118031, "calib/step_q_c_n": 1381.0, "calib/step_q_gap": 0.00785751349069752, "calib/step_q_w": 0.8677688442211056, "calib/step_q_w_n": 995.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1719.0, "completions/max_terminated_length": 1719.0, "completions/mean_length": 740.9140625, "completions/mean_terminated_length": 758.696044921875, "completions/min_length": 0.0, "completions/min_terminated_length": 428.0, "epoch": 0.13866666666666666, "grad_norm": 0.03951597586274147, "kl": 0.06847381591796875, "learning_rate": 1.944444444444445e-06, "loss": -0.1244, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.019231311976909637, "mask/share_reasoning": 0.8349332213401794, "mask/share_step_conf": 0.12239795178174973, "num_tokens": 41071209.0, "reward": 0.8673378229141235, "reward_std": 0.18943777680397034, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6826508045196533, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7356186509132385, "step": 130 }, { "adv/mean_abs_final_conf": 0.6092958450317383, "adv/mean_abs_reasoning": 0.4230729341506958, "adv/mean_abs_step_conf": 0.7680660486221313, "adv/ratio_final_to_reasoning": 1.4401673939621746, "adv/ratio_step_to_reasoning": 1.815445958895946, "adv/std_final_conf": 0.8299340605735779, "adv/std_reasoning": 0.7013399004936218, "adv/std_step_conf": 0.9331469535827637, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6704498125780926, "calib/avg_num_step_conf": 9.55859375, "calib/ece": 0.4599999999999999, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5795918367346938, "calib/gap": 0.11802721088435364, "calib/mean_conf": 0.86, "calib/mu_c": 0.9308163265306122, "calib/mu_w": 0.8127891156462586, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4599999999999999, "calib/std_conf": 0.1908712569631527, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8648687350835322, "calib/step_q_c_n": 838.0, "calib/step_q_gap": 0.054533122902053144, "calib/step_q_w": 0.8103356121814791, "calib/step_q_w_n": 1609.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2588.0, "completions/max_terminated_length": 2588.0, "completions/mean_length": 762.38671875, "completions/mean_terminated_length": 796.6162719726562, "completions/min_length": 0.0, "completions/min_terminated_length": 467.0, "epoch": 0.13973333333333332, "grad_norm": 0.04783172532916069, "kl": 0.0788421630859375, "learning_rate": 1.916666666666667e-06, "loss": -0.1903, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01806638203561306, "mask/share_reasoning": 0.8219820261001587, "mask/share_step_conf": 0.11698280274868011, "num_tokens": 41372588.0, "reward": 0.6918598413467407, "reward_std": 0.20048105716705322, "rewards/accuracy_reward_step": 0.3828125, "rewards/final_brier_reward_step": 0.5441882610321045, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.5715625286102295, "step": 131 }, { "adv/mean_abs_final_conf": 0.635727047920227, "adv/mean_abs_reasoning": 0.5477725267410278, "adv/mean_abs_step_conf": 0.7592942714691162, "adv/ratio_final_to_reasoning": 1.1605676022170819, "adv/ratio_step_to_reasoning": 1.3861488745821862, "adv/std_final_conf": 0.8477063775062561, "adv/std_reasoning": 0.7928697466850281, "adv/std_step_conf": 0.9318060874938965, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7335164835164835, "calib/avg_num_step_conf": 9.31640625, "calib/ece": 0.24838056680161935, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6639676113360324, "calib/gap": 0.1624450549450549, "calib/mean_conf": 0.8799595141700406, "calib/mu_c": 0.9398076923076922, "calib/mu_w": 0.7773626373626373, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24838056680161935, "calib/std_conf": 0.18466842586293722, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8826722925457101, "calib/step_q_c_n": 1422.0, "calib/step_q_gap": 0.05299420324145254, "calib/step_q_w": 0.8296780893042576, "calib/step_q_w_n": 963.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2868.0, "completions/max_terminated_length": 2868.0, "completions/mean_length": 777.60546875, "completions/mean_terminated_length": 799.4658203125, "completions/min_length": 0.0, "completions/min_terminated_length": 405.0, "epoch": 0.1408, "grad_norm": 0.04476737231016159, "kl": 0.0642242431640625, "learning_rate": 1.888888888888889e-06, "loss": -0.1192, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.018550019711256027, "mask/share_reasoning": 0.8334195613861084, "mask/share_step_conf": 0.12068665027618408, "num_tokens": 41677247.0, "reward": 0.8809730410575867, "reward_std": 0.24961604177951813, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7208496332168579, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7262526750564575, "step": 132 }, { "adv/mean_abs_final_conf": 0.7479898929595947, "adv/mean_abs_reasoning": 0.5849982500076294, "adv/mean_abs_step_conf": 0.779026985168457, "adv/ratio_final_to_reasoning": 1.2786190265523694, "adv/ratio_step_to_reasoning": 1.331674043739955, "adv/std_final_conf": 0.908273458480835, "adv/std_reasoning": 0.8268160820007324, "adv/std_step_conf": 0.9347463250160217, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.6537166900420758, "calib/avg_num_step_conf": 9.4453125, "calib/ece": 0.37397489539748946, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.5941422594142259, "calib/gap": 0.11778892005610075, "calib/mean_conf": 0.8417573221757323, "calib/mu_c": 0.9028695652173911, "calib/mu_w": 0.7850806451612904, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36728033472803334, "calib/std_conf": 0.2256989323259645, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.867897435897436, "calib/step_q_c_n": 975.0, "calib/step_q_gap": 0.03466805266805284, "calib/step_q_w": 0.8332293832293831, "calib/step_q_w_n": 1443.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2521.0, "completions/max_terminated_length": 2521.0, "completions/mean_length": 816.62890625, "completions/mean_terminated_length": 871.0708618164062, "completions/min_length": 0.0, "completions/min_terminated_length": 407.0, "epoch": 0.14186666666666667, "grad_norm": 0.045341163873672485, "kl": 0.06060791015625, "learning_rate": 1.8611111111111113e-06, "loss": -0.2734, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.016313698142766953, "mask/share_reasoning": 0.8198044300079346, "mask/share_step_conf": 0.10138189792633057, "num_tokens": 41992648.0, "reward": 0.7376042604446411, "reward_std": 0.3090021014213562, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.5864866971969604, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.612159252166748, "step": 133 }, { "adv/mean_abs_final_conf": 0.6931519508361816, "adv/mean_abs_reasoning": 0.5527679920196533, "adv/mean_abs_step_conf": 0.7484570741653442, "adv/ratio_final_to_reasoning": 1.253965426441582, "adv/ratio_step_to_reasoning": 1.3540166669757776, "adv/std_final_conf": 0.8895355463027954, "adv/std_reasoning": 0.8099076151847839, "adv/std_step_conf": 0.9333269596099854, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6481481481481481, "calib/avg_num_step_conf": 8.66796875, "calib/ece": 0.2990688259109312, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.5910931174089069, "calib/gap": 0.13690013227513242, "calib/mean_conf": 0.8456275303643724, "calib/mu_c": 0.9077037037037038, "calib/mu_w": 0.7708035714285714, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2990688259109312, "calib/std_conf": 0.21186350480245592, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8718744313011829, "calib/step_q_c_n": 1099.0, "calib/step_q_gap": 0.039347645586897184, "calib/step_q_w": 0.8325267857142857, "calib/step_q_w_n": 1120.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2675.0, "completions/max_terminated_length": 2675.0, "completions/mean_length": 896.453125, "completions/mean_terminated_length": 907.0830688476562, "completions/min_length": 0.0, "completions/min_terminated_length": 427.0, "epoch": 0.14293333333333333, "grad_norm": 0.06819811463356018, "kl": 0.0545196533203125, "learning_rate": 1.8333333333333333e-06, "loss": -0.0256, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.016849879175424576, "mask/share_reasoning": 0.8710737228393555, "mask/share_step_conf": 0.10035759210586548, "num_tokens": 42331092.0, "reward": 0.8029290437698364, "reward_std": 0.26459646224975586, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6504179239273071, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.6601275205612183, "step": 134 }, { "adv/mean_abs_final_conf": 0.6313650012016296, "adv/mean_abs_reasoning": 0.5012447237968445, "adv/mean_abs_step_conf": 0.7458860278129578, "adv/ratio_final_to_reasoning": 1.2595943083832304, "adv/ratio_step_to_reasoning": 1.4880675893463706, "adv/std_final_conf": 0.8750061988830566, "adv/std_reasoning": 0.7929680943489075, "adv/std_step_conf": 0.9351843595504761, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.5803836492349851, "calib/avg_num_step_conf": 9.984375, "calib/ece": 0.28176470588235286, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.6470588235294118, "calib/gap": 0.0517896018877978, "calib/mean_conf": 0.8650420168067227, "calib/mu_c": 0.8839735099337745, "calib/mu_w": 0.8321839080459768, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25617647058823523, "calib/std_conf": 0.20826362555757613, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8771782945736434, "calib/step_q_c_n": 1290.0, "calib/step_q_gap": 0.051862338807450725, "calib/step_q_w": 0.8253159557661927, "calib/step_q_w_n": 1266.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2998.0, "completions/max_terminated_length": 2998.0, "completions/mean_length": 795.2421875, "completions/mean_terminated_length": 834.3524169921875, "completions/min_length": 0.0, "completions/min_terminated_length": 429.0, "epoch": 0.144, "grad_norm": 0.06078675389289856, "kl": 0.0625457763671875, "learning_rate": 1.8055555555555557e-06, "loss": -0.2651, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.01735134981572628, "mask/share_reasoning": 0.8256722688674927, "mask/share_step_conf": 0.11010138690471649, "num_tokens": 42640554.0, "reward": 0.8158265352249146, "reward_std": 0.2599346935749054, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.646649181842804, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.6810975074768066, "step": 135 }, { "adv/mean_abs_final_conf": 0.6751922965049744, "adv/mean_abs_reasoning": 0.5883626341819763, "adv/mean_abs_step_conf": 0.7381198406219482, "adv/ratio_final_to_reasoning": 1.147578478439102, "adv/ratio_step_to_reasoning": 1.2545321503092821, "adv/std_final_conf": 0.8754206299781799, "adv/std_reasoning": 0.8267235159873962, "adv/std_step_conf": 0.9343062043190002, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.6577624135763671, "calib/avg_num_step_conf": 9.9765625, "calib/ece": 0.31429166666666664, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.5708333333333333, "calib/gap": 0.129585166561911, "calib/mean_conf": 0.822625, "calib/mu_c": 0.8825581395348838, "calib/mu_w": 0.7529729729729728, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2997083333333333, "calib/std_conf": 0.2318660375626409, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.870106907894737, "calib/step_q_c_n": 1216.0, "calib/step_q_gap": 0.11727432194555909, "calib/step_q_w": 0.7528325859491779, "calib/step_q_w_n": 1338.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2462.0, "completions/max_terminated_length": 2462.0, "completions/mean_length": 796.77734375, "completions/mean_terminated_length": 835.9630737304688, "completions/min_length": 0.0, "completions/min_terminated_length": 472.0, "epoch": 0.14506666666666668, "grad_norm": 0.03421531990170479, "kl": 0.06191253662109375, "learning_rate": 1.777777777777778e-06, "loss": -0.2477, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.017344415187835693, "mask/share_reasoning": 0.822014570236206, "mask/share_step_conf": 0.11376601457595825, "num_tokens": 42953017.0, "reward": 0.7648307085037231, "reward_std": 0.2806330621242523, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6361578106880188, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.6067847609519958, "step": 136 }, { "adv/mean_abs_final_conf": 0.6866587400436401, "adv/mean_abs_reasoning": 0.4152359366416931, "adv/mean_abs_step_conf": 0.753272294998169, "adv/ratio_final_to_reasoning": 1.6536592318987016, "adv/ratio_step_to_reasoning": 1.8140826179217895, "adv/std_final_conf": 0.905501663684845, "adv/std_reasoning": 0.6816924810409546, "adv/std_step_conf": 0.9320009350776672, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6449728827701294, "calib/avg_num_step_conf": 9.71484375, "calib/ece": 0.23781376518218636, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5870445344129555, "calib/gap": 0.12466624947851479, "calib/mean_conf": 0.8264777327935222, "calib/mu_c": 0.8739215686274511, "calib/mu_w": 0.7492553191489363, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22242914979757097, "calib/std_conf": 0.23045137567110754, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.878910447761194, "calib/step_q_c_n": 1407.0, "calib/step_q_gap": 0.06743822553897172, "calib/step_q_w": 0.8114722222222223, "calib/step_q_w_n": 1080.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2302.0, "completions/max_terminated_length": 2302.0, "completions/mean_length": 790.46484375, "completions/mean_terminated_length": 812.6867065429688, "completions/min_length": 0.0, "completions/min_terminated_length": 489.0, "epoch": 0.14613333333333334, "grad_norm": 0.05176422744989395, "kl": 0.05794525146484375, "learning_rate": 1.75e-06, "loss": -0.1748, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.017869368195533752, "mask/share_reasoning": 0.8374629020690918, "mask/share_step_conf": 0.11732391268014908, "num_tokens": 43262360.0, "reward": 0.8528109788894653, "reward_std": 0.22672004997730255, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6977546811103821, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.6953673362731934, "step": 137 }, { "adv/mean_abs_final_conf": 0.6333698630332947, "adv/mean_abs_reasoning": 0.5395751595497131, "adv/mean_abs_step_conf": 0.7648397088050842, "adv/ratio_final_to_reasoning": 1.173830655143308, "adv/ratio_step_to_reasoning": 1.4174850255214846, "adv/std_final_conf": 0.8593973517417908, "adv/std_reasoning": 0.8100116848945618, "adv/std_step_conf": 0.9284457564353943, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.6983270794246403, "calib/avg_num_step_conf": 9.375, "calib/ece": 0.17621848739495807, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.5714285714285714, "calib/gap": 0.1657113821138213, "calib/mean_conf": 0.8287394957983194, "calib/mu_c": 0.8858333333333334, "calib/mu_w": 0.7201219512195121, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17474789915966393, "calib/std_conf": 0.2115948031237759, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8783309352517986, "calib/step_q_c_n": 1390.0, "calib/step_q_gap": 0.051776479806253994, "calib/step_q_w": 0.8265544554455446, "calib/step_q_w_n": 1010.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2828.0, "completions/max_terminated_length": 2828.0, "completions/mean_length": 810.23828125, "completions/mean_terminated_length": 850.0859985351562, "completions/min_length": 0.0, "completions/min_terminated_length": 407.0, "epoch": 0.1472, "grad_norm": 0.05331015586853027, "kl": 0.0602569580078125, "learning_rate": 1.7222222222222224e-06, "loss": -0.2486, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.017089052125811577, "mask/share_reasoning": 0.8257807493209839, "mask/share_step_conf": 0.11025522649288177, "num_tokens": 43574117.0, "reward": 0.8627781271934509, "reward_std": 0.25776857137680054, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7197796702384949, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.6979640126228333, "step": 138 }, { "adv/mean_abs_final_conf": 0.6163228750228882, "adv/mean_abs_reasoning": 0.399943470954895, "adv/mean_abs_step_conf": 0.7604862451553345, "adv/ratio_final_to_reasoning": 1.5410249692322044, "adv/ratio_step_to_reasoning": 1.9014843356227733, "adv/std_final_conf": 0.8437338471412659, "adv/std_reasoning": 0.7014421820640564, "adv/std_step_conf": 0.9300811886787415, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7263708513708513, "calib/avg_num_step_conf": 8.6640625, "calib/ece": 0.19538152610441772, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.642570281124498, "calib/gap": 0.1927705627705626, "calib/mean_conf": 0.8467871485943775, "calib/mu_c": 0.9118181818181817, "calib/mu_w": 0.7190476190476192, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18975903614457834, "calib/std_conf": 0.2147047072627546, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8946872803935347, "calib/step_q_c_n": 1423.0, "calib/step_q_gap": 0.023190425047622698, "calib/step_q_w": 0.871496855345912, "calib/step_q_w_n": 795.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2639.0, "completions/max_terminated_length": 2639.0, "completions/mean_length": 779.59375, "completions/mean_terminated_length": 791.9683227539062, "completions/min_length": 0.0, "completions/min_terminated_length": 406.0, "epoch": 0.14826666666666666, "grad_norm": 0.050895802676677704, "kl": 0.05617523193359375, "learning_rate": 1.6944444444444446e-06, "loss": -0.0886, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01869884878396988, "mask/share_reasoning": 0.8509533405303955, "mask/share_step_conf": 0.11472281068563461, "num_tokens": 43876789.0, "reward": 0.9184540510177612, "reward_std": 0.2043038308620453, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.761236310005188, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7522342801094055, "step": 139 }, { "adv/mean_abs_final_conf": 0.646989643573761, "adv/mean_abs_reasoning": 0.4953835606575012, "adv/mean_abs_step_conf": 0.736619234085083, "adv/ratio_final_to_reasoning": 1.306037775486614, "adv/ratio_step_to_reasoning": 1.4869674583213865, "adv/std_final_conf": 0.8599854707717896, "adv/std_reasoning": 0.7575750350952148, "adv/std_step_conf": 0.9302241206169128, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6341016057585825, "calib/avg_num_step_conf": 8.796875, "calib/ece": 0.21019685039370073, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.610236220472441, "calib/gap": 0.10283914728682175, "calib/mean_conf": 0.8447637795275592, "calib/mu_c": 0.8795833333333334, "calib/mu_w": 0.7767441860465116, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19677165354330706, "calib/std_conf": 0.20869425161364966, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8876229508196721, "calib/step_q_c_n": 1464.0, "calib/step_q_gap": 0.06901888990596661, "calib/step_q_w": 0.8186040609137055, "calib/step_q_w_n": 788.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1631.0, "completions/max_terminated_length": 1631.0, "completions/mean_length": 771.68359375, "completions/mean_terminated_length": 777.7598266601562, "completions/min_length": 0.0, "completions/min_terminated_length": 398.0, "epoch": 0.14933333333333335, "grad_norm": 0.07742151618003845, "kl": 0.06075286865234375, "learning_rate": 1.6666666666666667e-06, "loss": -0.0299, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.018998388200998306, "mask/share_reasoning": 0.8548575639724731, "mask/share_step_conf": 0.11833152920007706, "num_tokens": 44179356.0, "reward": 0.905888020992279, "reward_std": 0.22323060035705566, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.7361972332000732, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7466724514961243, "step": 140 }, { "adv/mean_abs_final_conf": 0.5693894624710083, "adv/mean_abs_reasoning": 0.4516020119190216, "adv/mean_abs_step_conf": 0.7310876846313477, "adv/ratio_final_to_reasoning": 1.2608213591685848, "adv/ratio_step_to_reasoning": 1.6188760575372318, "adv/std_final_conf": 0.8129839301109314, "adv/std_reasoning": 0.739275336265564, "adv/std_step_conf": 0.9309452176094055, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7155354449472096, "calib/avg_num_step_conf": 8.515625, "calib/ece": 0.16854838709677428, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6411290322580645, "calib/gap": 0.19733031674208135, "calib/mean_conf": 0.8391129032258066, "calib/mu_c": 0.9011764705882352, "calib/mu_w": 0.7038461538461539, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16108870967741942, "calib/std_conf": 0.2282843165623637, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8867753366406802, "calib/step_q_c_n": 1411.0, "calib/step_q_gap": 0.05806272285654501, "calib/step_q_w": 0.8287126137841352, "calib/step_q_w_n": 769.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2123.0, "completions/max_terminated_length": 2123.0, "completions/mean_length": 781.87109375, "completions/mean_terminated_length": 803.8513793945312, "completions/min_length": 0.0, "completions/min_terminated_length": 429.0, "epoch": 0.1504, "grad_norm": 0.041482020169496536, "kl": 0.054931640625, "learning_rate": 1.638888888888889e-06, "loss": -0.1472, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.018007446080446243, "mask/share_reasoning": 0.8456077575683594, "mask/share_step_conf": 0.10904105007648468, "num_tokens": 44486611.0, "reward": 0.9167698621749878, "reward_std": 0.2230062186717987, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7688156366348267, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7389428615570068, "step": 141 }, { "adv/mean_abs_final_conf": 0.6314700841903687, "adv/mean_abs_reasoning": 0.42496341466903687, "adv/mean_abs_step_conf": 0.7561836242675781, "adv/ratio_final_to_reasoning": 1.4859398771589785, "adv/ratio_step_to_reasoning": 1.7794087635908535, "adv/std_final_conf": 0.8592272400856018, "adv/std_reasoning": 0.7013537883758545, "adv/std_step_conf": 0.9269928932189941, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7295842732839187, "calib/avg_num_step_conf": 8.57421875, "calib/ece": 0.23301587301587293, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5634920634920635, "calib/gap": 0.21156815984531108, "calib/mean_conf": 0.8028571428571429, "calib/mu_c": 0.8926896551724138, "calib/mu_w": 0.6811214953271028, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23023809523809516, "calib/std_conf": 0.24811465969132396, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.892886345053148, "calib/step_q_c_n": 1223.0, "calib/step_q_gap": 0.04764971953874475, "calib/step_q_w": 0.8452366255144033, "calib/step_q_w_n": 972.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2785.0, "completions/max_terminated_length": 2785.0, "completions/mean_length": 858.42578125, "completions/mean_terminated_length": 861.792236328125, "completions/min_length": 0.0, "completions/min_terminated_length": 305.0, "epoch": 0.15146666666666667, "grad_norm": 0.0425986647605896, "kl": 0.0515899658203125, "learning_rate": 1.6111111111111113e-06, "loss": 0.0064, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.018087752163410187, "mask/share_reasoning": 0.8661826848983765, "mask/share_step_conf": 0.11182329803705215, "num_tokens": 44811528.0, "reward": 0.8665826320648193, "reward_std": 0.1983642578125, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7341116666793823, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.6888971924781799, "step": 142 }, { "adv/mean_abs_final_conf": 0.6143150925636292, "adv/mean_abs_reasoning": 0.43332940340042114, "adv/mean_abs_step_conf": 0.7397428750991821, "adv/ratio_final_to_reasoning": 1.4176630705024347, "adv/ratio_step_to_reasoning": 1.7071144244869472, "adv/std_final_conf": 0.8298029899597168, "adv/std_reasoning": 0.7206790447235107, "adv/std_step_conf": 0.9270122647285461, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7693678870149457, "calib/avg_num_step_conf": 8.921875, "calib/ece": 0.26355102040816325, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.6571428571428571, "calib/gap": 0.24391128479363777, "calib/mean_conf": 0.8357959183673469, "calib/mu_c": 0.9373426573426571, "calib/mu_w": 0.6934313725490193, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2578367346938775, "calib/std_conf": 0.24057897677264514, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8957201646090537, "calib/step_q_c_n": 1215.0, "calib/step_q_gap": 0.08715140876246807, "calib/step_q_w": 0.8085687558465856, "calib/step_q_w_n": 1069.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 3060.0, "completions/max_terminated_length": 3060.0, "completions/mean_length": 793.13671875, "completions/mean_terminated_length": 825.3779907226562, "completions/min_length": 0.0, "completions/min_terminated_length": 403.0, "epoch": 0.15253333333333333, "grad_norm": 0.05409567803144455, "kl": 0.052845001220703125, "learning_rate": 1.5833333333333333e-06, "loss": -0.1822, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.017870299518108368, "mask/share_reasoning": 0.8328666687011719, "mask/share_step_conf": 0.11020059883594513, "num_tokens": 45121907.0, "reward": 0.8470816016197205, "reward_std": 0.2363058477640152, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7216949462890625, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.6693432331085205, "step": 143 }, { "adv/mean_abs_final_conf": 0.5318283438682556, "adv/mean_abs_reasoning": 0.3669816851615906, "adv/mean_abs_step_conf": 0.7277644872665405, "adv/ratio_final_to_reasoning": 1.449195873723451, "adv/ratio_step_to_reasoning": 1.983108467514091, "adv/std_final_conf": 0.7739172577857971, "adv/std_reasoning": 0.6612470746040344, "adv/std_step_conf": 0.9174556732177734, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6250771604938272, "calib/avg_num_step_conf": 8.6953125, "calib/ece": 0.18960317460317455, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6904761904761905, "calib/gap": 0.11563888888888885, "calib/mean_conf": 0.8591269841269841, "calib/mu_c": 0.8921666666666666, "calib/mu_w": 0.7765277777777777, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1672222222222222, "calib/std_conf": 0.22965819958568048, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8850234113712375, "calib/step_q_c_n": 1495.0, "calib/step_q_gap": 0.027362672657147202, "calib/step_q_w": 0.8576607387140903, "calib/step_q_w_n": 731.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2923.0, "completions/max_terminated_length": 2923.0, "completions/mean_length": 819.66015625, "completions/mean_terminated_length": 826.1141967773438, "completions/min_length": 0.0, "completions/min_terminated_length": 436.0, "epoch": 0.1536, "grad_norm": 0.06142843887209892, "kl": 0.050872802734375, "learning_rate": 1.5555555555555558e-06, "loss": 0.0024, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.018356967717409134, "mask/share_reasoning": 0.8610062003135681, "mask/share_step_conf": 0.11282435059547424, "num_tokens": 45435868.0, "reward": 0.9260880947113037, "reward_std": 0.1764068305492401, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7573742270469666, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.757301926612854, "step": 144 }, { "adv/mean_abs_final_conf": 0.6020931601524353, "adv/mean_abs_reasoning": 0.4983627200126648, "adv/mean_abs_step_conf": 0.7491400241851807, "adv/ratio_final_to_reasoning": 1.2081424552324749, "adv/ratio_step_to_reasoning": 1.5032023747003846, "adv/std_final_conf": 0.8259950876235962, "adv/std_reasoning": 0.7576925754547119, "adv/std_step_conf": 0.9257418513298035, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6714731427607723, "calib/avg_num_step_conf": 8.71484375, "calib/ece": 0.16372540983606557, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.7008196721311475, "calib/gap": 0.16040500885403486, "calib/mean_conf": 0.8539713114754097, "calib/mu_c": 0.8980169491525425, "calib/mu_w": 0.7376119402985076, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14614344262295081, "calib/std_conf": 0.23767791264771557, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8855218525766471, "calib/step_q_c_n": 1533.0, "calib/step_q_gap": 0.03201182392335189, "calib/step_q_w": 0.8535100286532952, "calib/step_q_w_n": 698.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2981.0, "completions/max_terminated_length": 2981.0, "completions/mean_length": 812.96484375, "completions/mean_terminated_length": 829.1593627929688, "completions/min_length": 0.0, "completions/min_terminated_length": 444.0, "epoch": 0.15466666666666667, "grad_norm": 0.044827286154031754, "kl": 0.05010986328125, "learning_rate": 1.527777777777778e-06, "loss": -0.0536, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.018157798796892166, "mask/share_reasoning": 0.8492351770401001, "mask/share_step_conf": 0.11307579278945923, "num_tokens": 45746691.0, "reward": 0.9063776731491089, "reward_std": 0.23287507891654968, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.754582405090332, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7284855246543884, "step": 145 }, { "adv/mean_abs_final_conf": 0.6597222089767456, "adv/mean_abs_reasoning": 0.5190834999084473, "adv/mean_abs_step_conf": 0.7232632637023926, "adv/ratio_final_to_reasoning": 1.2709365816734752, "adv/ratio_step_to_reasoning": 1.393346665478593, "adv/std_final_conf": 0.8603639602661133, "adv/std_reasoning": 0.7928233742713928, "adv/std_step_conf": 0.927862823009491, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.6576347826086957, "calib/avg_num_step_conf": 8.91015625, "calib/ece": 0.3995833333333334, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.7333333333333333, "calib/gap": 0.1361878260869568, "calib/mean_conf": 0.8754166666666667, "calib/mu_c": 0.9463478260869566, "calib/mu_w": 0.8101599999999998, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39791666666666675, "calib/std_conf": 0.21381025791315272, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8873650793650794, "calib/step_q_c_n": 945.0, "calib/step_q_gap": 0.04676627696986979, "calib/step_q_w": 0.8405988023952096, "calib/step_q_w_n": 1336.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2517.0, "completions/max_terminated_length": 2517.0, "completions/mean_length": 791.56640625, "completions/mean_terminated_length": 830.495849609375, "completions/min_length": 0.0, "completions/min_terminated_length": 452.0, "epoch": 0.15573333333333333, "grad_norm": 0.05562068521976471, "kl": 0.047908782958984375, "learning_rate": 1.5e-06, "loss": -0.1804, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.017334166914224625, "mask/share_reasoning": 0.8294928073883057, "mask/share_step_conf": 0.10629801452159882, "num_tokens": 46056548.0, "reward": 0.7173329591751099, "reward_std": 0.26008641719818115, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.5772007703781128, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.5801214575767517, "step": 146 }, { "adv/mean_abs_final_conf": 0.6195129156112671, "adv/mean_abs_reasoning": 0.4309051036834717, "adv/mean_abs_step_conf": 0.752669095993042, "adv/ratio_final_to_reasoning": 1.4377015039170673, "adv/ratio_step_to_reasoning": 1.7467165962042708, "adv/std_final_conf": 0.8434000611305237, "adv/std_reasoning": 0.7393536567687988, "adv/std_step_conf": 0.9289423227310181, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6683632998413539, "calib/avg_num_step_conf": 9.57421875, "calib/ece": 0.3651626016260162, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7560975609756098, "calib/gap": 0.18607879428873597, "calib/mean_conf": 0.861910569105691, "calib/mu_c": 0.9541935483870967, "calib/mu_w": 0.7681147540983607, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36150406504065036, "calib/std_conf": 0.25249166477846935, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8937033582089552, "calib/step_q_c_n": 1072.0, "calib/step_q_gap": 0.05780778170424161, "calib/step_q_w": 0.8358955765047136, "calib/step_q_w_n": 1379.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2775.0, "completions/max_terminated_length": 2775.0, "completions/mean_length": 816.8359375, "completions/mean_terminated_length": 843.1854858398438, "completions/min_length": 0.0, "completions/min_terminated_length": 447.0, "epoch": 0.1568, "grad_norm": 0.06812643259763718, "kl": 0.04569244384765625, "learning_rate": 1.4722222222222225e-06, "loss": -0.1596, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.017271030694246292, "mask/share_reasoning": 0.8407407999038696, "mask/share_step_conf": 0.11073816567659378, "num_tokens": 46369338.0, "reward": 0.7616201639175415, "reward_std": 0.2501278519630432, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.6258050203323364, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.608372688293457, "step": 147 }, { "adv/mean_abs_final_conf": 0.6027733087539673, "adv/mean_abs_reasoning": 0.46370819211006165, "adv/mean_abs_step_conf": 0.739616334438324, "adv/ratio_final_to_reasoning": 1.2998979078008144, "adv/ratio_step_to_reasoning": 1.5950038127917636, "adv/std_final_conf": 0.8275401592254639, "adv/std_reasoning": 0.7396039366722107, "adv/std_step_conf": 0.9280713200569153, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.645531028095091, "calib/avg_num_step_conf": 8.54296875, "calib/ece": 0.24541666666666673, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.7958333333333333, "calib/gap": 0.16023155294844071, "calib/mean_conf": 0.8865833333333335, "calib/mu_c": 0.9413291139240504, "calib/mu_w": 0.7810975609756097, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2368333333333334, "calib/std_conf": 0.2259258574301657, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8914390065741417, "calib/step_q_c_n": 1369.0, "calib/step_q_gap": 0.06576663493599988, "calib/step_q_w": 0.8256723716381418, "calib/step_q_w_n": 818.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2929.0, "completions/max_terminated_length": 2929.0, "completions/mean_length": 807.80078125, "completions/mean_terminated_length": 837.23486328125, "completions/min_length": 0.0, "completions/min_terminated_length": 365.0, "epoch": 0.15786666666666666, "grad_norm": 0.039087146520614624, "kl": 0.045024871826171875, "learning_rate": 1.4444444444444445e-06, "loss": -0.181, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.018149036914110184, "mask/share_reasoning": 0.8342307806015015, "mask/share_step_conf": 0.11246395111083984, "num_tokens": 46681247.0, "reward": 0.8411773443222046, "reward_std": 0.27904221415519714, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6974323987960815, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.6747660636901855, "step": 148 }, { "adv/mean_abs_final_conf": 0.5810620188713074, "adv/mean_abs_reasoning": 0.4962088465690613, "adv/mean_abs_step_conf": 0.7387970685958862, "adv/ratio_final_to_reasoning": 1.1710029413803213, "adv/ratio_step_to_reasoning": 1.4888833073093186, "adv/std_final_conf": 0.7941713929176331, "adv/std_reasoning": 0.7576331496238708, "adv/std_step_conf": 0.9306121468544006, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7055350805350806, "calib/avg_num_step_conf": 8.40234375, "calib/ece": 0.29906882591093126, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8137651821862348, "calib/gap": 0.2109991809991809, "calib/mean_conf": 0.8921862348178138, "calib/mu_c": 0.9767567567567566, "calib/mu_w": 0.7657575757575756, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2960323886639677, "calib/std_conf": 0.225261156878797, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8979166666666668, "calib/step_q_c_n": 1224.0, "calib/step_q_gap": 0.05478829557713061, "calib/step_q_w": 0.8431283710895362, "calib/step_q_w_n": 927.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2358.0, "completions/max_terminated_length": 2358.0, "completions/mean_length": 807.56640625, "completions/mean_terminated_length": 830.26904296875, "completions/min_length": 0.0, "completions/min_terminated_length": 430.0, "epoch": 0.15893333333333334, "grad_norm": 0.11413798481225967, "kl": 0.041881561279296875, "learning_rate": 1.4166666666666667e-06, "loss": -0.1408, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.017695922404527664, "mask/share_reasoning": 0.8483445644378662, "mask/share_step_conf": 0.10661575198173523, "num_tokens": 46992440.0, "reward": 0.8469992876052856, "reward_std": 0.26091471314430237, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6991230249404907, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.6862818002700806, "step": 149 }, { "adv/mean_abs_final_conf": 0.5980448722839355, "adv/mean_abs_reasoning": 0.520313024520874, "adv/mean_abs_step_conf": 0.7211383581161499, "adv/ratio_final_to_reasoning": 1.1493943916446072, "adv/ratio_step_to_reasoning": 1.3859702220220305, "adv/std_final_conf": 0.8278406262397766, "adv/std_reasoning": 0.7754331827163696, "adv/std_step_conf": 0.9302837252616882, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6322846283783783, "calib/avg_num_step_conf": 9.05078125, "calib/ece": 0.28569672131147544, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.8114754098360656, "calib/gap": 0.1797494369369369, "calib/mean_conf": 0.8795491803278688, "calib/mu_c": 0.9502702702702702, "calib/mu_w": 0.7705208333333333, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.279344262295082, "calib/std_conf": 0.25590766144813387, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8999468892261002, "calib/step_q_c_n": 1318.0, "calib/step_q_gap": 0.08838532766453866, "calib/step_q_w": 0.8115615615615616, "calib/step_q_w_n": 999.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3035.0, "completions/max_terminated_length": 3035.0, "completions/mean_length": 794.09375, "completions/mean_terminated_length": 816.4176635742188, "completions/min_length": 0.0, "completions/min_terminated_length": 382.0, "epoch": 0.16, "grad_norm": 0.03058474138379097, "kl": 0.044178009033203125, "learning_rate": 1.3888888888888892e-06, "loss": -0.1441, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.018817484378814697, "mask/share_reasoning": 0.8333878517150879, "mask/share_step_conf": 0.12045087665319443, "num_tokens": 47300688.0, "reward": 0.8491833209991455, "reward_std": 0.2430640459060669, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6739871501922607, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7181295156478882, "step": 150 }, { "adv/mean_abs_final_conf": 0.6749951839447021, "adv/mean_abs_reasoning": 0.6116622090339661, "adv/mean_abs_step_conf": 0.7771894335746765, "adv/ratio_final_to_reasoning": 1.103542402939625, "adv/ratio_step_to_reasoning": 1.2706186880535537, "adv/std_final_conf": 0.8708445429801941, "adv/std_reasoning": 0.8267552852630615, "adv/std_step_conf": 0.9338359832763672, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.6486131801092431, "calib/avg_num_step_conf": 8.515625, "calib/ece": 0.37487394957983183, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.6302521008403361, "calib/gap": 0.1708292544513017, "calib/mean_conf": 0.7660504201680672, "calib/mu_c": 0.857207207207207, "calib/mu_w": 0.6863779527559053, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3372689075630251, "calib/std_conf": 0.3341632431732412, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8933407079646017, "calib/step_q_c_n": 904.0, "calib/step_q_gap": 0.08486892113074584, "calib/step_q_w": 0.8084717868338559, "calib/step_q_w_n": 1276.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2328.0, "completions/max_terminated_length": 2328.0, "completions/mean_length": 790.69921875, "completions/mean_terminated_length": 843.4125366210938, "completions/min_length": 0.0, "completions/min_terminated_length": 448.0, "epoch": 0.16106666666666666, "grad_norm": 0.05278412252664566, "kl": 0.04607391357421875, "learning_rate": 1.3611111111111112e-06, "loss": -0.2397, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.01703723706305027, "mask/share_reasoning": 0.8188620805740356, "mask/share_step_conf": 0.10160071402788162, "num_tokens": 47610131.0, "reward": 0.7299396395683289, "reward_std": 0.29607152938842773, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.590067982673645, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.5971550345420837, "step": 151 }, { "adv/mean_abs_final_conf": 0.6524473428726196, "adv/mean_abs_reasoning": 0.5343358516693115, "adv/mean_abs_step_conf": 0.7500141859054565, "adv/ratio_final_to_reasoning": 1.2210435456170825, "adv/ratio_step_to_reasoning": 1.4036381492320742, "adv/std_final_conf": 0.8601976037025452, "adv/std_reasoning": 0.792835533618927, "adv/std_step_conf": 0.9345920085906982, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6524177814938684, "calib/avg_num_step_conf": 8.8359375, "calib/ece": 0.27661157024793376, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.6446280991735537, "calib/gap": 0.21769091415830555, "calib/mean_conf": 0.7511570247933883, "calib/mu_c": 0.8447101449275364, "calib/mu_w": 0.6270192307692308, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22876033057851228, "calib/std_conf": 0.3624006049715013, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8907277397260275, "calib/step_q_c_n": 1168.0, "calib/step_q_gap": 0.057400500237910435, "calib/step_q_w": 0.833327239488117, "calib/step_q_w_n": 1094.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2731.0, "completions/max_terminated_length": 2731.0, "completions/mean_length": 833.3515625, "completions/mean_terminated_length": 853.35205078125, "completions/min_length": 0.0, "completions/min_terminated_length": 465.0, "epoch": 0.16213333333333332, "grad_norm": 0.053785111755132675, "kl": 0.045932769775390625, "learning_rate": 1.3333333333333334e-06, "loss": -0.1069, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.017900928854942322, "mask/share_reasoning": 0.8457293510437012, "mask/share_step_conf": 0.11293217539787292, "num_tokens": 47928861.0, "reward": 0.8092750310897827, "reward_std": 0.25778883695602417, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6594210863113403, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.6622539758682251, "step": 152 }, { "adv/mean_abs_final_conf": 0.5960100293159485, "adv/mean_abs_reasoning": 0.4430437684059143, "adv/mean_abs_step_conf": 0.7543562054634094, "adv/ratio_final_to_reasoning": 1.3452621881138553, "adv/ratio_step_to_reasoning": 1.7026674546797198, "adv/std_final_conf": 0.8276448845863342, "adv/std_reasoning": 0.7207258939743042, "adv/std_step_conf": 0.9289489388465881, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.654132082280765, "calib/avg_num_step_conf": 8.53515625, "calib/ece": 0.20362903225806445, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6733870967741935, "calib/gap": 0.23280692890653176, "calib/mean_conf": 0.7826612903225807, "calib/mu_c": 0.8624539877300612, "calib/mu_w": 0.6296470588235294, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16451612903225799, "calib/std_conf": 0.33721011175423377, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.883667409057164, "calib/step_q_c_n": 1347.0, "calib/step_q_gap": 0.017247361324467025, "calib/step_q_w": 0.866420047732697, "calib/step_q_w_n": 838.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 3001.0, "completions/max_terminated_length": 3001.0, "completions/mean_length": 789.28125, "completions/mean_terminated_length": 808.2240600585938, "completions/min_length": 0.0, "completions/min_terminated_length": 412.0, "epoch": 0.1632, "grad_norm": 0.0818745419383049, "kl": 0.05124664306640625, "learning_rate": 1.3055555555555556e-06, "loss": -0.0529, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.018392134457826614, "mask/share_reasoning": 0.845497727394104, "mask/share_step_conf": 0.11267266422510147, "num_tokens": 48238237.0, "reward": 0.879777729511261, "reward_std": 0.21167093515396118, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7267390489578247, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7117226123809814, "step": 153 }, { "adv/mean_abs_final_conf": 0.6036090850830078, "adv/mean_abs_reasoning": 0.3608783185482025, "adv/mean_abs_step_conf": 0.7696006894111633, "adv/ratio_final_to_reasoning": 1.6726111103357508, "adv/ratio_step_to_reasoning": 2.132576688195713, "adv/std_final_conf": 0.8111317753791809, "adv/std_reasoning": 0.640428900718689, "adv/std_step_conf": 0.9313547611236572, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7188457950153272, "calib/avg_num_step_conf": 8.26171875, "calib/ece": 0.30257142857142855, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.6, "calib/gap": 0.2740690390510465, "calib/mean_conf": 0.7428979591836734, "calib/mu_c": 0.8804918032786887, "calib/mu_w": 0.6064227642276422, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2737551020408163, "calib/std_conf": 0.3528487127331878, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8891649694501019, "calib/step_q_c_n": 982.0, "calib/step_q_gap": 0.02997697298055202, "calib/step_q_w": 0.8591879964695499, "calib/step_q_w_n": 1133.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2919.0, "completions/max_terminated_length": 2919.0, "completions/mean_length": 771.3828125, "completions/mean_terminated_length": 796.26611328125, "completions/min_length": 0.0, "completions/min_terminated_length": 402.0, "epoch": 0.16426666666666667, "grad_norm": 0.08071709424257278, "kl": 0.051548004150390625, "learning_rate": 1.2777777777777779e-06, "loss": -0.0806, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.018711235374212265, "mask/share_reasoning": 0.8364302515983582, "mask/share_step_conf": 0.11360852420330048, "num_tokens": 48540151.0, "reward": 0.7974370121955872, "reward_std": 0.22275030612945557, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.6723519563674927, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.6358032822608948, "step": 154 }, { "adv/mean_abs_final_conf": 0.753005862236023, "adv/mean_abs_reasoning": 0.5864537358283997, "adv/mean_abs_step_conf": 0.7431141138076782, "adv/ratio_final_to_reasoning": 1.2839987474414478, "adv/ratio_step_to_reasoning": 1.267131690717234, "adv/std_final_conf": 0.9215671420097351, "adv/std_reasoning": 0.8266345858573914, "adv/std_step_conf": 0.9348805546760559, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6045225155279503, "calib/avg_num_step_conf": 8.82421875, "calib/ece": 0.2928, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.524, "calib/gap": 0.17288819875776396, "calib/mean_conf": 0.67472, "calib/mu_c": 0.7521739130434784, "calib/mu_w": 0.5792857142857144, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20776000000000003, "calib/std_conf": 0.3875060278240843, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8760777683854608, "calib/step_q_c_n": 1183.0, "calib/step_q_gap": 0.043670705188434744, "calib/step_q_w": 0.8324070631970261, "calib/step_q_w_n": 1076.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2896.0, "completions/max_terminated_length": 2896.0, "completions/mean_length": 794.5625, "completions/mean_terminated_length": 807.1746215820312, "completions/min_length": 0.0, "completions/min_terminated_length": 409.0, "epoch": 0.16533333333333333, "grad_norm": 0.05100875720381737, "kl": 0.059783935546875, "learning_rate": 1.25e-06, "loss": -0.0423, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.01870831474661827, "mask/share_reasoning": 0.8480067253112793, "mask/share_step_conf": 0.11765995621681213, "num_tokens": 48850775.0, "reward": 0.8101447820663452, "reward_std": 0.26722443103790283, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6571409702301025, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.6608047485351562, "step": 155 }, { "adv/mean_abs_final_conf": 0.7030758857727051, "adv/mean_abs_reasoning": 0.4742887020111084, "adv/mean_abs_step_conf": 0.7566878795623779, "adv/ratio_final_to_reasoning": 1.4823795776527653, "adv/ratio_step_to_reasoning": 1.5954162018910065, "adv/std_final_conf": 0.906621515750885, "adv/std_reasoning": 0.7392860054969788, "adv/std_step_conf": 0.9321083426475525, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6082010582010583, "calib/avg_num_step_conf": 8.74609375, "calib/ece": 0.3310526315789473, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.4493927125506073, "calib/gap": 0.1552129629629631, "calib/mean_conf": 0.5785829959514168, "calib/mu_c": 0.6489629629629631, "calib/mu_w": 0.49374999999999997, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1815384615384615, "calib/std_conf": 0.4222496337868507, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8693238731218699, "calib/step_q_c_n": 1198.0, "calib/step_q_gap": 0.02888679339084199, "calib/step_q_w": 0.8404370797310279, "calib/step_q_w_n": 1041.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2943.0, "completions/max_terminated_length": 2943.0, "completions/mean_length": 819.390625, "completions/mean_terminated_length": 829.1067504882812, "completions/min_length": 0.0, "completions/min_terminated_length": 364.0, "epoch": 0.1664, "grad_norm": 0.07828212529420853, "kl": 0.05800628662109375, "learning_rate": 1.2222222222222223e-06, "loss": 0.0139, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.01829274743795395, "mask/share_reasoning": 0.8516146540641785, "mask/share_step_conf": 0.11837387084960938, "num_tokens": 49165299.0, "reward": 0.79437255859375, "reward_std": 0.21283696591854095, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6228859424591064, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.6689841747283936, "step": 156 }, { "adv/mean_abs_final_conf": 0.74039626121521, "adv/mean_abs_reasoning": 0.4796062111854553, "adv/mean_abs_step_conf": 0.7580063343048096, "adv/ratio_final_to_reasoning": 1.5437587002577655, "adv/ratio_step_to_reasoning": 1.5804764755469394, "adv/std_final_conf": 0.906609833240509, "adv/std_reasoning": 0.7575482726097107, "adv/std_step_conf": 0.933040201663971, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6703153316418993, "calib/avg_num_step_conf": 9.07421875, "calib/ece": 0.26471311475409837, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.5040983606557377, "calib/gap": 0.28305255527365, "calib/mean_conf": 0.6073360655737705, "calib/mu_c": 0.7105806451612904, "calib/mu_w": 0.4275280898876404, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11840163934426229, "calib/std_conf": 0.42741583603792177, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8700872093023256, "calib/step_q_c_n": 1376.0, "calib/step_q_gap": 0.11039343950295932, "calib/step_q_w": 0.7596937697993663, "calib/step_q_w_n": 947.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3025.0, "completions/max_terminated_length": 3025.0, "completions/mean_length": 815.16796875, "completions/mean_terminated_length": 841.4636840820312, "completions/min_length": 0.0, "completions/min_terminated_length": 388.0, "epoch": 0.16746666666666668, "grad_norm": 0.07636091113090515, "kl": 0.06465911865234375, "learning_rate": 1.1944444444444446e-06, "loss": -0.0861, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.017809420824050903, "mask/share_reasoning": 0.8347960710525513, "mask/share_step_conf": 0.11614448577165604, "num_tokens": 49477710.0, "reward": 0.8488029837608337, "reward_std": 0.23808994889259338, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6824370622634888, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7026687860488892, "step": 157 }, { "adv/mean_abs_final_conf": 0.7005129456520081, "adv/mean_abs_reasoning": 0.5425291061401367, "adv/mean_abs_step_conf": 0.7504801154136658, "adv/ratio_final_to_reasoning": 1.2911988273511426, "adv/ratio_step_to_reasoning": 1.3832992680392981, "adv/std_final_conf": 0.8809415698051453, "adv/std_reasoning": 0.7928447127342224, "adv/std_step_conf": 0.9298225045204163, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6185819940264542, "calib/avg_num_step_conf": 9.46875, "calib/ece": 0.3114979757085021, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.46153846153846156, "calib/gap": 0.20209216327691654, "calib/mean_conf": 0.5668016194331984, "calib/mu_c": 0.639620253164557, "calib/mu_w": 0.4375280898876404, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11931174089068827, "calib/std_conf": 0.435395363535799, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8779732582688247, "calib/step_q_c_n": 1421.0, "calib/step_q_gap": 0.05564025727181576, "calib/step_q_w": 0.8223330009970089, "calib/step_q_w_n": 1003.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2820.0, "completions/max_terminated_length": 2820.0, "completions/mean_length": 751.07421875, "completions/mean_terminated_length": 775.3023681640625, "completions/min_length": 0.0, "completions/min_terminated_length": 378.0, "epoch": 0.16853333333333334, "grad_norm": 0.10405830293893814, "kl": 0.0607147216796875, "learning_rate": 1.1666666666666668e-06, "loss": -0.1067, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.01923152431845665, "mask/share_reasoning": 0.8270524740219116, "mask/share_step_conf": 0.12246599793434143, "num_tokens": 49775225.0, "reward": 0.8258012533187866, "reward_std": 0.2111181616783142, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6408855319023132, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.695091962814331, "step": 158 }, { "adv/mean_abs_final_conf": 0.6621556282043457, "adv/mean_abs_reasoning": 0.5811421275138855, "adv/mean_abs_step_conf": 0.7509884238243103, "adv/ratio_final_to_reasoning": 1.1394039372726847, "adv/ratio_step_to_reasoning": 1.2922629220446018, "adv/std_final_conf": 0.8604494333267212, "adv/std_reasoning": 0.8266856074333191, "adv/std_step_conf": 0.9339320063591003, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.7197873799725651, "calib/avg_num_step_conf": 9.2890625, "calib/ece": 0.2325102880658435, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.42386831275720166, "calib/gap": 0.34694444444444444, "calib/mean_conf": 0.5387654320987654, "calib/mu_c": 0.6929629629629629, "calib/mu_w": 0.34601851851851845, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1078600823045266, "calib/std_conf": 0.43246460703267314, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8845020990764064, "calib/step_q_c_n": 1191.0, "calib/step_q_gap": 0.08339847649847876, "calib/step_q_w": 0.8011036225779277, "calib/step_q_w_n": 1187.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3027.0, "completions/max_terminated_length": 3027.0, "completions/mean_length": 792.6640625, "completions/mean_terminated_length": 818.2338256835938, "completions/min_length": 0.0, "completions/min_terminated_length": 394.0, "epoch": 0.1696, "grad_norm": 0.04640193283557892, "kl": 0.06124114990234375, "learning_rate": 1.138888888888889e-06, "loss": -0.1201, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.018069058656692505, "mask/share_reasoning": 0.8351427316665649, "mask/share_step_conf": 0.11553823947906494, "num_tokens": 50082931.0, "reward": 0.8521698117256165, "reward_std": 0.22859559953212738, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6974222660064697, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.7123861312866211, "step": 159 }, { "adv/mean_abs_final_conf": 0.6154712438583374, "adv/mean_abs_reasoning": 0.47096890211105347, "adv/mean_abs_step_conf": 0.7640544772148132, "adv/ratio_final_to_reasoning": 1.3068192848818085, "adv/ratio_step_to_reasoning": 1.6223034552600901, "adv/std_final_conf": 0.811108410358429, "adv/std_reasoning": 0.7395078539848328, "adv/std_step_conf": 0.9293590188026428, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.6589110707803993, "calib/avg_num_step_conf": 9.4609375, "calib/ece": 0.29712500000000003, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.5083333333333333, "calib/gap": 0.2579274047186933, "calib/mean_conf": 0.5860416666666667, "calib/mu_c": 0.6881379310344827, "calib/mu_w": 0.43021052631578943, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1395, "calib/std_conf": 0.439617919255523, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.860059335443038, "calib/step_q_c_n": 1264.0, "calib/step_q_gap": 0.12134603665201893, "calib/step_q_w": 0.7387132987910191, "calib/step_q_w_n": 1158.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3043.0, "completions/max_terminated_length": 3043.0, "completions/mean_length": 819.359375, "completions/mean_terminated_length": 849.214599609375, "completions/min_length": 0.0, "completions/min_terminated_length": 493.0, "epoch": 0.17066666666666666, "grad_norm": 0.15533457696437836, "kl": 0.059192657470703125, "learning_rate": 1.111111111111111e-06, "loss": -0.1435, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.017590470612049103, "mask/share_reasoning": 0.8359103798866272, "mask/share_step_conf": 0.1113428920507431, "num_tokens": 50397527.0, "reward": 0.8208951354026794, "reward_std": 0.21404847502708435, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6474605202674866, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.693548321723938, "step": 160 }, { "adv/mean_abs_final_conf": 0.5511636734008789, "adv/mean_abs_reasoning": 0.3422989547252655, "adv/mean_abs_step_conf": 0.7528557777404785, "adv/ratio_final_to_reasoning": 1.6101821691020106, "adv/ratio_step_to_reasoning": 2.1994100985342837, "adv/std_final_conf": 0.7935216426849365, "adv/std_reasoning": 0.640374481678009, "adv/std_step_conf": 0.9291737079620361, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6562388591800358, "calib/avg_num_step_conf": 8.98828125, "calib/ece": 0.271902834008097, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5951417004048583, "calib/gap": 0.2708288770053475, "calib/mean_conf": 0.6700404858299596, "calib/mu_c": 0.7358288770053476, "calib/mu_w": 0.4650000000000001, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09242914979757073, "calib/std_conf": 0.420668927470327, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8642918192918193, "calib/step_q_c_n": 1638.0, "calib/step_q_gap": 0.05602635926165345, "calib/step_q_w": 0.8082654600301659, "calib/step_q_w_n": 663.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 3053.0, "completions/max_terminated_length": 3053.0, "completions/mean_length": 765.5859375, "completions/mean_terminated_length": 783.9600219726562, "completions/min_length": 0.0, "completions/min_terminated_length": 365.0, "epoch": 0.17173333333333332, "grad_norm": 0.04432344064116478, "kl": 0.05446624755859375, "learning_rate": 1.0833333333333335e-06, "loss": -0.0982, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.019417721778154373, "mask/share_reasoning": 0.8330705165863037, "mask/share_step_conf": 0.12407425045967102, "num_tokens": 50697437.0, "reward": 0.9070243239402771, "reward_std": 0.18110650777816772, "rewards/accuracy_reward_step": 0.73046875, "rewards/final_brier_reward_step": 0.7054632902145386, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7695227265357971, "step": 161 }, { "adv/mean_abs_final_conf": 0.562353253364563, "adv/mean_abs_reasoning": 0.401167631149292, "adv/mean_abs_step_conf": 0.7378664016723633, "adv/ratio_final_to_reasoning": 1.401791195749008, "adv/ratio_step_to_reasoning": 1.8392969531426901, "adv/std_final_conf": 0.7708464860916138, "adv/std_reasoning": 0.7014594078063965, "adv/std_step_conf": 0.9175775051116943, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6586058651926916, "calib/avg_num_step_conf": 8.72265625, "calib/ece": 0.24224081632653044, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.6408163265306123, "calib/gap": 0.24810471364962405, "calib/mean_conf": 0.7205265306122449, "calib/mu_c": 0.7995149700598803, "calib/mu_w": 0.5514102564102562, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14056734693877534, "calib/std_conf": 0.39542936146468877, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8857506925207755, "calib/step_q_c_n": 1444.0, "calib/step_q_gap": 0.09616894347134586, "calib/step_q_w": 0.7895817490494297, "calib/step_q_w_n": 789.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2954.0, "completions/max_terminated_length": 2954.0, "completions/mean_length": 759.67578125, "completions/mean_terminated_length": 787.3563232421875, "completions/min_length": 0.0, "completions/min_terminated_length": 408.0, "epoch": 0.1728, "grad_norm": 0.11026094108819962, "kl": 0.053619384765625, "learning_rate": 1.0555555555555557e-06, "loss": -0.1468, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01879039779305458, "mask/share_reasoning": 0.8299312591552734, "mask/share_step_conf": 0.11612210422754288, "num_tokens": 50996058.0, "reward": 0.8966192007064819, "reward_std": 0.1913757026195526, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7013081908226013, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7700551152229309, "step": 162 }, { "adv/mean_abs_final_conf": 0.5956529378890991, "adv/mean_abs_reasoning": 0.4349844753742218, "adv/mean_abs_step_conf": 0.7498718500137329, "adv/ratio_final_to_reasoning": 1.3693659696168525, "adv/ratio_step_to_reasoning": 1.7239048574517748, "adv/std_final_conf": 0.8112250566482544, "adv/std_reasoning": 0.7014384269714355, "adv/std_step_conf": 0.9236242175102234, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6668549030029233, "calib/avg_num_step_conf": 8.484375, "calib/ece": 0.27717741935483864, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5967741935483871, "calib/gap": 0.2917618921073612, "calib/mean_conf": 0.6567741935483872, "calib/mu_c": 0.7814788732394367, "calib/mu_w": 0.48971698113207546, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1806854838709677, "calib/std_conf": 0.42956570038462133, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8771989966555185, "calib/step_q_c_n": 1196.0, "calib/step_q_gap": 0.04497563599978083, "calib/step_q_w": 0.8322233606557377, "calib/step_q_w_n": 976.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3052.0, "completions/max_terminated_length": 3052.0, "completions/mean_length": 802.96875, "completions/mean_terminated_length": 812.4901733398438, "completions/min_length": 0.0, "completions/min_terminated_length": 311.0, "epoch": 0.17386666666666667, "grad_norm": 0.04852801561355591, "kl": 0.05211639404296875, "learning_rate": 1.0277777777777777e-06, "loss": -0.0933, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01897309347987175, "mask/share_reasoning": 0.8520207405090332, "mask/share_step_conf": 0.11728739738464355, "num_tokens": 51306450.0, "reward": 0.8401877880096436, "reward_std": 0.2186070829629898, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6843827962875366, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.6913052797317505, "step": 163 }, { "adv/mean_abs_final_conf": 0.659756064414978, "adv/mean_abs_reasoning": 0.47427132725715637, "adv/mean_abs_step_conf": 0.7514228820800781, "adv/ratio_final_to_reasoning": 1.3910941406273318, "adv/ratio_step_to_reasoning": 1.5843734143191128, "adv/std_final_conf": 0.8602266311645508, "adv/std_reasoning": 0.7393752932548523, "adv/std_step_conf": 0.9299604296684265, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6942012745990616, "calib/avg_num_step_conf": 9.1875, "calib/ece": 0.28970833333333323, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.5916666666666667, "calib/gap": 0.24238882274669105, "calib/mean_conf": 0.727625, "calib/mu_c": 0.8377099236641222, "calib/mu_w": 0.5953211009174312, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23574999999999993, "calib/std_conf": 0.3616314090917251, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8610126582278481, "calib/step_q_c_n": 1106.0, "calib/step_q_gap": 0.12315551537070513, "calib/step_q_w": 0.737857142857143, "calib/step_q_w_n": 1246.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2319.0, "completions/max_terminated_length": 2319.0, "completions/mean_length": 791.1484375, "completions/mean_terminated_length": 840.3900756835938, "completions/min_length": 0.0, "completions/min_terminated_length": 414.0, "epoch": 0.17493333333333333, "grad_norm": 0.04501301795244217, "kl": 0.0543060302734375, "learning_rate": 1.0000000000000002e-06, "loss": -0.276, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.016907479614019394, "mask/share_reasoning": 0.8249204158782959, "mask/share_step_conf": 0.0995783731341362, "num_tokens": 51615120.0, "reward": 0.8000724911689758, "reward_std": 0.2464374154806137, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.6641730070114136, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.6453468203544617, "step": 164 }, { "adv/mean_abs_final_conf": 0.5247633457183838, "adv/mean_abs_reasoning": 0.43585866689682007, "adv/mean_abs_step_conf": 0.7637988328933716, "adv/ratio_final_to_reasoning": 1.2039759343425191, "adv/ratio_step_to_reasoning": 1.7524002409574297, "adv/std_final_conf": 0.7588238716125488, "adv/std_reasoning": 0.7014684677124023, "adv/std_step_conf": 0.930880069732666, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.593040293040293, "calib/avg_num_step_conf": 8.63671875, "calib/ece": 0.340040322580645, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.7661290322580645, "calib/gap": 0.12131401931401908, "calib/mean_conf": 0.8384274193548388, "calib/mu_c": 0.8897902097902096, "calib/mu_w": 0.7684761904761905, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3009274193548386, "calib/std_conf": 0.30262085491390733, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8641282894736844, "calib/step_q_c_n": 1216.0, "calib/step_q_gap": 0.026801656307855137, "calib/step_q_w": 0.8373266331658292, "calib/step_q_w_n": 995.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2951.0, "completions/max_terminated_length": 2951.0, "completions/mean_length": 816.29296875, "completions/mean_terminated_length": 829.2500610351562, "completions/min_length": 0.0, "completions/min_terminated_length": 427.0, "epoch": 0.176, "grad_norm": 0.05763980746269226, "kl": 0.0528717041015625, "learning_rate": 9.722222222222224e-07, "loss": -0.0403, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.018184591084718704, "mask/share_reasoning": 0.8538507223129272, "mask/share_step_conf": 0.11233967542648315, "num_tokens": 51929667.0, "reward": 0.8078880310058594, "reward_std": 0.21158744394779205, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6345081925392151, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.6757991313934326, "step": 165 }, { "adv/mean_abs_final_conf": 0.5661165714263916, "adv/mean_abs_reasoning": 0.47203224897384644, "adv/mean_abs_step_conf": 0.7055524587631226, "adv/ratio_final_to_reasoning": 1.1993175734435002, "adv/ratio_step_to_reasoning": 1.4947124064021622, "adv/std_final_conf": 0.8220821619033813, "adv/std_reasoning": 0.7753757238388062, "adv/std_step_conf": 0.9330090284347534, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7565439219165928, "calib/avg_num_step_conf": 8.09375, "calib/ece": 0.21706122448979598, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.6612244897959184, "calib/gap": 0.2990476190476189, "calib/mean_conf": 0.7803265306122449, "calib/mu_c": 0.8828571428571428, "calib/mu_w": 0.5838095238095239, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1701224489795919, "calib/std_conf": 0.32980126283808553, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.874810606060606, "calib/step_q_c_n": 1320.0, "calib/step_q_gap": 0.08522284010315928, "calib/step_q_w": 0.7895877659574467, "calib/step_q_w_n": 752.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2979.0, "completions/max_terminated_length": 2979.0, "completions/mean_length": 818.90234375, "completions/mean_terminated_length": 835.2151489257812, "completions/min_length": 0.0, "completions/min_terminated_length": 411.0, "epoch": 0.17706666666666668, "grad_norm": 0.03530353680253029, "kl": 0.0486907958984375, "learning_rate": 9.444444444444445e-07, "loss": -0.0762, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01848408207297325, "mask/share_reasoning": 0.8508272767066956, "mask/share_step_conf": 0.1111573576927185, "num_tokens": 52245490.0, "reward": 0.897564172744751, "reward_std": 0.22545835375785828, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7517530918121338, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7261877655982971, "step": 166 }, { "adv/mean_abs_final_conf": 0.5064303874969482, "adv/mean_abs_reasoning": 0.43117716908454895, "adv/mean_abs_step_conf": 0.7519251704216003, "adv/ratio_final_to_reasoning": 1.174529691755648, "adv/ratio_step_to_reasoning": 1.7438891117960755, "adv/std_final_conf": 0.7763898372650146, "adv/std_reasoning": 0.7205421924591064, "adv/std_step_conf": 0.9301944971084595, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.518722233975037, "calib/avg_num_step_conf": 8.34375, "calib/ece": 0.2897999999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.788, "calib/gap": 0.027499471123334174, "calib/mean_conf": 0.88724, "calib/mu_c": 0.8968098159509201, "calib/mu_w": 0.869310344827586, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2625199999999999, "calib/std_conf": 0.219036029912889, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.869957507082153, "calib/step_q_c_n": 1412.0, "calib/step_q_gap": 0.0055514297340868435, "calib/step_q_w": 0.8644060773480662, "calib/step_q_w_n": 724.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2842.0, "completions/max_terminated_length": 2842.0, "completions/mean_length": 770.41015625, "completions/mean_terminated_length": 776.4763793945312, "completions/min_length": 0.0, "completions/min_terminated_length": 146.0, "epoch": 0.17813333333333334, "grad_norm": 0.11281926184892654, "kl": 0.048736572265625, "learning_rate": 9.166666666666666e-07, "loss": -0.0473, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.018992718309164047, "mask/share_reasoning": 0.8573728203773499, "mask/share_step_conf": 0.1158219575881958, "num_tokens": 52548323.0, "reward": 0.8557096123695374, "reward_std": 0.20991703867912292, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6662777662277222, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7217038869857788, "step": 167 }, { "adv/mean_abs_final_conf": 0.6514350175857544, "adv/mean_abs_reasoning": 0.5799128413200378, "adv/mean_abs_step_conf": 0.7642394304275513, "adv/ratio_final_to_reasoning": 1.1233326306465516, "adv/ratio_step_to_reasoning": 1.3178522287727523, "adv/std_final_conf": 0.8442649841308594, "adv/std_reasoning": 0.8100561499595642, "adv/std_step_conf": 0.9333626627922058, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6436970672639462, "calib/avg_num_step_conf": 8.70703125, "calib/ece": 0.2369758064516128, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7258064516129032, "calib/gap": 0.17539371456568909, "calib/mean_conf": 0.848508064516129, "calib/mu_c": 0.9128662420382165, "calib/mu_w": 0.7374725274725275, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22620967741935474, "calib/std_conf": 0.25611196505560024, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8747720125786164, "calib/step_q_c_n": 1272.0, "calib/step_q_gap": 0.11429134382208561, "calib/step_q_w": 0.7604806687565308, "calib/step_q_w_n": 957.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2408.0, "completions/max_terminated_length": 2408.0, "completions/mean_length": 828.46875, "completions/mean_terminated_length": 848.35205078125, "completions/min_length": 0.0, "completions/min_terminated_length": 434.0, "epoch": 0.1792, "grad_norm": 0.09699970483779907, "kl": 0.047489166259765625, "learning_rate": 8.88888888888889e-07, "loss": -0.0745, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01726042851805687, "mask/share_reasoning": 0.8543730974197388, "mask/share_step_conf": 0.10492902994155884, "num_tokens": 52865083.0, "reward": 0.8707404136657715, "reward_std": 0.28335040807724, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7139902114868164, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7118656635284424, "step": 168 }, { "adv/mean_abs_final_conf": 0.5136945247650146, "adv/mean_abs_reasoning": 0.4035590589046478, "adv/mean_abs_step_conf": 0.7587224841117859, "adv/ratio_final_to_reasoning": 1.2729104041408457, "adv/ratio_step_to_reasoning": 1.8800779399454774, "adv/std_final_conf": 0.77040696144104, "adv/std_reasoning": 0.6816996335983276, "adv/std_step_conf": 0.932590126991272, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6487183059348008, "calib/avg_num_step_conf": 8.1171875, "calib/ece": 0.28783673469387766, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.7510204081632653, "calib/gap": 0.13626288659793806, "calib/mean_conf": 0.8735510204081632, "calib/mu_c": 0.9274999999999999, "calib/mu_w": 0.7912371134020618, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2786530612244899, "calib/std_conf": 0.21838564104181507, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8780775444264943, "calib/step_q_c_n": 1238.0, "calib/step_q_gap": 0.05716087775982759, "calib/step_q_w": 0.8209166666666667, "calib/step_q_w_n": 840.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2687.0, "completions/max_terminated_length": 2687.0, "completions/mean_length": 827.3515625, "completions/mean_terminated_length": 843.8327026367188, "completions/min_length": 0.0, "completions/min_terminated_length": 446.0, "epoch": 0.18026666666666666, "grad_norm": 0.03865334019064903, "kl": 0.0500640869140625, "learning_rate": 8.611111111111112e-07, "loss": -0.0522, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01773287169635296, "mask/share_reasoning": 0.8558939099311829, "mask/share_step_conf": 0.10684196650981903, "num_tokens": 53181069.0, "reward": 0.846420168876648, "reward_std": 0.20503714680671692, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6753827929496765, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7104262709617615, "step": 169 }, { "adv/mean_abs_final_conf": 0.5498205423355103, "adv/mean_abs_reasoning": 0.4751802682876587, "adv/mean_abs_step_conf": 0.7622960209846497, "adv/ratio_final_to_reasoning": 1.1570778061067695, "adv/ratio_step_to_reasoning": 1.6042249054903526, "adv/std_final_conf": 0.7767378091812134, "adv/std_reasoning": 0.7208435535430908, "adv/std_step_conf": 0.929639995098114, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6231788564316568, "calib/avg_num_step_conf": 8.61328125, "calib/ece": 0.2904918032786884, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.8401639344262295, "calib/gap": 0.12844424921297326, "calib/mean_conf": 0.8982786885245901, "calib/mu_c": 0.944076433121019, "calib/mu_w": 0.8156321839080457, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2726639344262294, "calib/std_conf": 0.23366885853159886, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8714438502673797, "calib/step_q_c_n": 1309.0, "calib/step_q_gap": 0.04695724312452254, "calib/step_q_w": 0.8244866071428572, "calib/step_q_w_n": 896.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2986.0, "completions/max_terminated_length": 2986.0, "completions/mean_length": 811.78515625, "completions/mean_terminated_length": 831.26806640625, "completions/min_length": 0.0, "completions/min_terminated_length": 424.0, "epoch": 0.18133333333333335, "grad_norm": 0.0797303095459938, "kl": 0.0508575439453125, "learning_rate": 8.333333333333333e-07, "loss": -0.0568, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01804276928305626, "mask/share_reasoning": 0.8481639623641968, "mask/share_step_conf": 0.11035574227571487, "num_tokens": 53493038.0, "reward": 0.8397831916809082, "reward_std": 0.2692488431930542, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.676689863204956, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.6895952224731445, "step": 170 }, { "adv/mean_abs_final_conf": 0.6254308223724365, "adv/mean_abs_reasoning": 0.4784793257713318, "adv/mean_abs_step_conf": 0.7460007667541504, "adv/ratio_final_to_reasoning": 1.307121935444571, "adv/ratio_step_to_reasoning": 1.5591076282168745, "adv/std_final_conf": 0.8441042304039001, "adv/std_reasoning": 0.7392579913139343, "adv/std_step_conf": 0.9327840805053711, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6507320735827806, "calib/avg_num_step_conf": 8.58984375, "calib/ece": 0.34754940711462456, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7351778656126482, "calib/gap": 0.16773995745213388, "calib/mean_conf": 0.8530830039525692, "calib/mu_c": 0.9339694656488552, "calib/mu_w": 0.7662295081967213, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3414229249011858, "calib/std_conf": 0.25475862603103616, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8812911843276937, "calib/step_q_c_n": 1123.0, "calib/step_q_gap": 0.06497148172546308, "calib/step_q_w": 0.8163197026022306, "calib/step_q_w_n": 1076.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2273.0, "completions/max_terminated_length": 2273.0, "completions/mean_length": 778.046875, "completions/mean_terminated_length": 787.2727661132812, "completions/min_length": 0.0, "completions/min_terminated_length": 444.0, "epoch": 0.1824, "grad_norm": 0.06779231131076813, "kl": 0.052455902099609375, "learning_rate": 8.055555555555557e-07, "loss": -0.0545, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.01890362799167633, "mask/share_reasoning": 0.8550175428390503, "mask/share_step_conf": 0.11436008661985397, "num_tokens": 53799114.0, "reward": 0.8081221580505371, "reward_std": 0.22378191351890564, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.6490582227706909, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.6671860218048096, "step": 171 }, { "adv/mean_abs_final_conf": 0.4872409701347351, "adv/mean_abs_reasoning": 0.3581220209598541, "adv/mean_abs_step_conf": 0.751218318939209, "adv/ratio_final_to_reasoning": 1.3605445675437964, "adv/ratio_step_to_reasoning": 2.097660224651255, "adv/std_final_conf": 0.7403228282928467, "adv/std_reasoning": 0.6403510570526123, "adv/std_step_conf": 0.9275820255279541, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6201541425818882, "calib/avg_num_step_conf": 7.93359375, "calib/ece": 0.2174193548387097, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.8185483870967742, "calib/gap": 0.11301579961464348, "calib/mean_conf": 0.9113709677419355, "calib/mu_c": 0.9455491329479768, "calib/mu_w": 0.8325333333333333, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21560483870967742, "calib/std_conf": 0.1761841717718548, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8880857142857144, "calib/step_q_c_n": 1400.0, "calib/step_q_gap": 0.038529454380801686, "calib/step_q_w": 0.8495562599049127, "calib/step_q_w_n": 631.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2541.0, "completions/max_terminated_length": 2541.0, "completions/mean_length": 752.23046875, "completions/mean_terminated_length": 761.1502075195312, "completions/min_length": 0.0, "completions/min_terminated_length": 374.0, "epoch": 0.18346666666666667, "grad_norm": 0.038075245916843414, "kl": 0.0482177734375, "learning_rate": 7.777777777777779e-07, "loss": -0.0529, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.019558429718017578, "mask/share_reasoning": 0.853873610496521, "mask/share_step_conf": 0.11484922468662262, "num_tokens": 54095037.0, "reward": 0.9087539911270142, "reward_std": 0.18010839819908142, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7362257838249207, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7523757815361023, "step": 172 }, { "adv/mean_abs_final_conf": 0.5600264072418213, "adv/mean_abs_reasoning": 0.4762570261955261, "adv/mean_abs_step_conf": 0.7568649053573608, "adv/ratio_final_to_reasoning": 1.1758911185320842, "adv/ratio_step_to_reasoning": 1.5891942033977087, "adv/std_final_conf": 0.811123788356781, "adv/std_reasoning": 0.7392693161964417, "adv/std_step_conf": 0.9293073415756226, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5953743603555077, "calib/avg_num_step_conf": 8.61328125, "calib/ece": 0.2741269841269841, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7896825396825397, "calib/gap": 0.1049555615405332, "calib/mean_conf": 0.8935714285714286, "calib/mu_c": 0.9327215189873416, "calib/mu_w": 0.8277659574468084, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.27035714285714285, "calib/std_conf": 0.20683734996949568, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8779779411764708, "calib/step_q_c_n": 1360.0, "calib/step_q_gap": 0.016664331709015157, "calib/step_q_w": 0.8613136094674556, "calib/step_q_w_n": 845.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2521.0, "completions/max_terminated_length": 2521.0, "completions/mean_length": 797.02734375, "completions/mean_terminated_length": 800.1530151367188, "completions/min_length": 0.0, "completions/min_terminated_length": 265.0, "epoch": 0.18453333333333333, "grad_norm": 0.038408100605010986, "kl": 0.04986572265625, "learning_rate": 7.5e-07, "loss": -0.0281, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.01891019381582737, "mask/share_reasoning": 0.8586477041244507, "mask/share_step_conf": 0.11853589117527008, "num_tokens": 54402236.0, "reward": 0.8644517660140991, "reward_std": 0.23591762781143188, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6904085874557495, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.718182384967804, "step": 173 }, { "adv/mean_abs_final_conf": 0.6717838644981384, "adv/mean_abs_reasoning": 0.6233100891113281, "adv/mean_abs_step_conf": 0.7425686717033386, "adv/ratio_final_to_reasoning": 1.077768315054744, "adv/ratio_step_to_reasoning": 1.1913310640648238, "adv/std_final_conf": 0.8761308789253235, "adv/std_reasoning": 0.8590900301933289, "adv/std_step_conf": 0.9354785680770874, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5538930084745762, "calib/avg_num_step_conf": 8.7890625, "calib/ece": 0.34117886178861795, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.6910569105691057, "calib/gap": 0.05903336864406794, "calib/mean_conf": 0.8480894308943089, "calib/mu_c": 0.87640625, "calib/mu_w": 0.8173728813559321, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3344715447154472, "calib/std_conf": 0.22910396362450083, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.864655013799448, "calib/step_q_c_n": 1087.0, "calib/step_q_gap": 0.04448031500426719, "calib/step_q_w": 0.8201746987951808, "calib/step_q_w_n": 1162.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2326.0, "completions/max_terminated_length": 2326.0, "completions/mean_length": 841.515625, "completions/mean_terminated_length": 861.7120361328125, "completions/min_length": 0.0, "completions/min_terminated_length": 107.0, "epoch": 0.1856, "grad_norm": 0.0463547483086586, "kl": 0.04796600341796875, "learning_rate": 7.222222222222222e-07, "loss": -0.1683, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01758679747581482, "mask/share_reasoning": 0.8539252281188965, "mask/share_step_conf": 0.1050504595041275, "num_tokens": 54721896.0, "reward": 0.7721678018569946, "reward_std": 0.29204386472702026, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.590755820274353, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.6621733903884888, "step": 174 }, { "adv/mean_abs_final_conf": 0.5728576183319092, "adv/mean_abs_reasoning": 0.4325658082962036, "adv/mean_abs_step_conf": 0.7267290353775024, "adv/ratio_final_to_reasoning": 1.3243247786696062, "adv/ratio_step_to_reasoning": 1.6800427159047848, "adv/std_final_conf": 0.7944455742835999, "adv/std_reasoning": 0.7205823063850403, "adv/std_step_conf": 0.9327433109283447, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6657325338894682, "calib/avg_num_step_conf": 8.484375, "calib/ece": 0.34783132530120486, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.6184738955823293, "calib/gap": 0.19052593847758081, "calib/mean_conf": 0.7836546184738957, "calib/mu_c": 0.8884821428571429, "calib/mu_w": 0.6979562043795621, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.340843373493976, "calib/std_conf": 0.294405495144094, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8670323928944619, "calib/step_q_c_n": 957.0, "calib/step_q_gap": 0.04722169330598447, "calib/step_q_w": 0.8198106995884774, "calib/step_q_w_n": 1215.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1987.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 828.51953125, "completions/mean_terminated_length": 841.670654296875, "completions/min_length": 0.0, "completions/min_terminated_length": 400.0, "epoch": 0.18666666666666668, "grad_norm": 0.036516882479190826, "kl": 0.052318572998046875, "learning_rate": 6.944444444444446e-07, "loss": -0.0844, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01774880848824978, "mask/share_reasoning": 0.856615424156189, "mask/share_step_conf": 0.11001075804233551, "num_tokens": 55039821.0, "reward": 0.7852064967155457, "reward_std": 0.21699534356594086, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.6309511661529541, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.6574305295944214, "step": 175 }, { "adv/mean_abs_final_conf": 0.5725212097167969, "adv/mean_abs_reasoning": 0.4553416967391968, "adv/mean_abs_step_conf": 0.7647500038146973, "adv/ratio_final_to_reasoning": 1.2573441303898778, "adv/ratio_step_to_reasoning": 1.6795079591683393, "adv/std_final_conf": 0.8109849095344543, "adv/std_reasoning": 0.7206153869628906, "adv/std_step_conf": 0.9289859533309937, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6485102109139605, "calib/avg_num_step_conf": 8.4296875, "calib/ece": 0.23387096774193544, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6169354838709677, "calib/gap": 0.16042986273853388, "calib/mean_conf": 0.8108870967741936, "calib/mu_c": 0.8775172413793104, "calib/mu_w": 0.7170873786407765, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.23004032258064513, "calib/std_conf": 0.25536821011381305, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8757729468599034, "calib/step_q_c_n": 1242.0, "calib/step_q_gap": 0.06768342720924836, "calib/step_q_w": 0.808089519650655, "calib/step_q_w_n": 916.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2603.0, "completions/max_terminated_length": 2603.0, "completions/mean_length": 777.57421875, "completions/mean_terminated_length": 789.9166870117188, "completions/min_length": 0.0, "completions/min_terminated_length": 94.0, "epoch": 0.18773333333333334, "grad_norm": 0.03444397449493408, "kl": 0.052021026611328125, "learning_rate": 6.666666666666667e-07, "loss": -0.1408, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.018836980685591698, "mask/share_reasoning": 0.8486654162406921, "mask/share_step_conf": 0.11687257885932922, "num_tokens": 55342944.0, "reward": 0.8689711689949036, "reward_std": 0.2312285155057907, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6962413787841797, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.73466956615448, "step": 176 }, { "adv/mean_abs_final_conf": 0.5899331569671631, "adv/mean_abs_reasoning": 0.37707242369651794, "adv/mean_abs_step_conf": 0.7464255690574646, "adv/ratio_final_to_reasoning": 1.5645088844841208, "adv/ratio_step_to_reasoning": 1.9795283933524026, "adv/std_final_conf": 0.8274363875389099, "adv/std_reasoning": 0.6814361810684204, "adv/std_step_conf": 0.9328826069831848, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.774943167466532, "calib/avg_num_step_conf": 8.5546875, "calib/ece": 0.2322352941176469, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6235294117647059, "calib/gap": 0.26361707501894405, "calib/mean_conf": 0.8126274509803922, "calib/mu_c": 0.9232432432432431, "calib/mu_w": 0.6596261682242991, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2322352941176469, "calib/std_conf": 0.24333741286811122, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8775059382422803, "calib/step_q_c_n": 1263.0, "calib/step_q_gap": 0.05978209789708078, "calib/step_q_w": 0.8177238403451995, "calib/step_q_w_n": 927.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1907.0, "completions/max_terminated_length": 1907.0, "completions/mean_length": 781.00390625, "completions/mean_terminated_length": 784.0667114257812, "completions/min_length": 0.0, "completions/min_terminated_length": 336.0, "epoch": 0.1888, "grad_norm": 0.05831204727292061, "kl": 0.05739593505859375, "learning_rate": 6.388888888888889e-07, "loss": -0.0203, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.019267741590738297, "mask/share_reasoning": 0.857624888420105, "mask/share_step_conf": 0.11920113116502762, "num_tokens": 55646713.0, "reward": 0.8887894153594971, "reward_std": 0.18345263600349426, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7657734155654907, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.6977428197860718, "step": 177 }, { "adv/mean_abs_final_conf": 0.6337563991546631, "adv/mean_abs_reasoning": 0.4424035847187042, "adv/mean_abs_step_conf": 0.7219555377960205, "adv/ratio_final_to_reasoning": 1.4325299817758659, "adv/ratio_step_to_reasoning": 1.6318935079494559, "adv/std_final_conf": 0.8438934683799744, "adv/std_reasoning": 0.7206499576568604, "adv/std_step_conf": 0.92996746301651, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.749377141077546, "calib/avg_num_step_conf": 8.71875, "calib/ece": 0.13142857142857134, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.636734693877551, "calib/gap": 0.27502257863593904, "calib/mean_conf": 0.8103673469387754, "calib/mu_c": 0.8956804733727811, "calib/mu_w": 0.6206578947368421, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12599999999999992, "calib/std_conf": 0.2704060962570209, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8815164220824597, "calib/step_q_c_n": 1431.0, "calib/step_q_gap": 0.06092965554063701, "calib/step_q_w": 0.8205867665418227, "calib/step_q_w_n": 801.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2906.0, "completions/max_terminated_length": 2906.0, "completions/mean_length": 802.984375, "completions/mean_terminated_length": 822.2560424804688, "completions/min_length": 0.0, "completions/min_terminated_length": 400.0, "epoch": 0.18986666666666666, "grad_norm": 0.11171025782823563, "kl": 0.0521392822265625, "learning_rate": 6.111111111111112e-07, "loss": -0.0644, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.018252698704600334, "mask/share_reasoning": 0.8460112810134888, "mask/share_step_conf": 0.11229851096868515, "num_tokens": 55958349.0, "reward": 0.9219865798950195, "reward_std": 0.21582040190696716, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.780997633934021, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7395378947257996, "step": 178 }, { "adv/mean_abs_final_conf": 0.6373348236083984, "adv/mean_abs_reasoning": 0.5426758527755737, "adv/mean_abs_step_conf": 0.7440859079360962, "adv/ratio_final_to_reasoning": 1.1744300402324541, "adv/ratio_step_to_reasoning": 1.3711424677003576, "adv/std_final_conf": 0.8385090231895447, "adv/std_reasoning": 0.7928795218467712, "adv/std_step_conf": 0.9288709163665771, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.7027412280701755, "calib/avg_num_step_conf": 8.47265625, "calib/ece": 0.255481171548117, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.698744769874477, "calib/gap": 0.18927339181286595, "calib/mean_conf": 0.8504602510460251, "calib/mu_c": 0.9256944444444445, "calib/mu_w": 0.7364210526315785, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25171548117154796, "calib/std_conf": 0.22995587508686588, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8779867986798678, "calib/step_q_c_n": 1212.0, "calib/step_q_gap": 0.07686872135489387, "calib/step_q_w": 0.801118077324974, "calib/step_q_w_n": 957.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 3064.0, "completions/max_terminated_length": 3064.0, "completions/mean_length": 798.48046875, "completions/mean_terminated_length": 834.33056640625, "completions/min_length": 0.0, "completions/min_terminated_length": 408.0, "epoch": 0.19093333333333334, "grad_norm": 0.042912401258945465, "kl": 0.0484619140625, "learning_rate": 5.833333333333334e-07, "loss": -0.1355, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.018003320321440697, "mask/share_reasoning": 0.829845130443573, "mask/share_step_conf": 0.10918280482292175, "num_tokens": 56269024.0, "reward": 0.8372169733047485, "reward_std": 0.2432376891374588, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6839832067489624, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.6920132040977478, "step": 179 }, { "adv/mean_abs_final_conf": 0.5640825033187866, "adv/mean_abs_reasoning": 0.4570828080177307, "adv/mean_abs_step_conf": 0.7516995668411255, "adv/ratio_final_to_reasoning": 1.2340925832784884, "adv/ratio_step_to_reasoning": 1.6445588275373646, "adv/std_final_conf": 0.8109744787216187, "adv/std_reasoning": 0.757621169090271, "adv/std_step_conf": 0.9324401617050171, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6544264874141876, "calib/avg_num_step_conf": 9.0859375, "calib/ece": 0.207827868852459, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.6229508196721312, "calib/gap": 0.16543192219679614, "calib/mean_conf": 0.8147950819672132, "calib/mu_c": 0.8771710526315788, "calib/mu_w": 0.7117391304347827, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19983606557377048, "calib/std_conf": 0.2515435118154883, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8635598455598456, "calib/step_q_c_n": 1295.0, "calib/step_q_gap": 0.08801183392066025, "calib/step_q_w": 0.7755480116391853, "calib/step_q_w_n": 1031.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2856.0, "completions/max_terminated_length": 2856.0, "completions/mean_length": 850.9765625, "completions/mean_terminated_length": 867.9282836914062, "completions/min_length": 0.0, "completions/min_terminated_length": 390.0, "epoch": 0.192, "grad_norm": 0.027483295649290085, "kl": 0.0508270263671875, "learning_rate": 5.555555555555555e-07, "loss": -0.0867, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.017159832641482353, "mask/share_reasoning": 0.8568102121353149, "mask/share_step_conf": 0.10649865865707397, "num_tokens": 56590730.0, "reward": 0.8719885349273682, "reward_std": 0.22523637115955353, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7079362869262695, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7266656756401062, "step": 180 }, { "adv/mean_abs_final_conf": 0.6659194827079773, "adv/mean_abs_reasoning": 0.5319613218307495, "adv/mean_abs_step_conf": 0.7389317750930786, "adv/ratio_final_to_reasoning": 1.251819362385614, "adv/ratio_step_to_reasoning": 1.3890704920990091, "adv/std_final_conf": 0.8759973049163818, "adv/std_reasoning": 0.7927730679512024, "adv/std_step_conf": 0.9336047768592834, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6733273462056579, "calib/avg_num_step_conf": 8.6640625, "calib/ece": 0.2990799999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.628, "calib/gap": 0.17728718968503476, "calib/mean_conf": 0.81668, "calib/mu_c": 0.9010687022900765, "calib/mu_w": 0.7237815126050418, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2958799999999999, "calib/std_conf": 0.2502538263443738, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8699122036874452, "calib/step_q_c_n": 1139.0, "calib/step_q_gap": 0.04246085985055925, "calib/step_q_w": 0.8274513438368859, "calib/step_q_w_n": 1079.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2591.0, "completions/max_terminated_length": 2591.0, "completions/mean_length": 791.1953125, "completions/mean_terminated_length": 803.7540283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 422.0, "epoch": 0.19306666666666666, "grad_norm": 0.05116986483335495, "kl": 0.05887603759765625, "learning_rate": 5.277777777777779e-07, "loss": -0.0833, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.018377797678112984, "mask/share_reasoning": 0.852816104888916, "mask/share_step_conf": 0.11318107694387436, "num_tokens": 56899540.0, "reward": 0.8266412019729614, "reward_std": 0.24478717148303986, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.6745378971099854, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.6810882091522217, "step": 181 }, { "adv/mean_abs_final_conf": 0.5257282257080078, "adv/mean_abs_reasoning": 0.3534095287322998, "adv/mean_abs_step_conf": 0.7751988172531128, "adv/ratio_final_to_reasoning": 1.4875892780645306, "adv/ratio_step_to_reasoning": 2.1934858973208655, "adv/std_final_conf": 0.776384711265564, "adv/std_reasoning": 0.6401922702789307, "adv/std_step_conf": 0.9291584491729736, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.709341755319149, "calib/avg_num_step_conf": 8.46484375, "calib/ece": 0.23803149606299207, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7047244094488189, "calib/gap": 0.19531781914893642, "calib/mean_conf": 0.8537795275590552, "calib/mu_c": 0.9260625000000001, "calib/mu_w": 0.7307446808510637, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2309448818897637, "calib/std_conf": 0.23002714366998875, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8782753515914139, "calib/step_q_c_n": 1351.0, "calib/step_q_gap": 0.06087339080710019, "calib/step_q_w": 0.8174019607843137, "calib/step_q_w_n": 816.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2306.0, "completions/max_terminated_length": 2306.0, "completions/mean_length": 754.671875, "completions/mean_terminated_length": 760.6141967773438, "completions/min_length": 0.0, "completions/min_terminated_length": 461.0, "epoch": 0.19413333333333332, "grad_norm": 0.0377509668469429, "kl": 0.0584564208984375, "learning_rate": 5.000000000000001e-07, "loss": -0.0351, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.019371334463357925, "mask/share_reasoning": 0.8548128604888916, "mask/share_step_conf": 0.11800329387187958, "num_tokens": 57198896.0, "reward": 0.8838473558425903, "reward_std": 0.1849229335784912, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7490218877792358, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.6952352523803711, "step": 182 }, { "adv/mean_abs_final_conf": 0.6666151881217957, "adv/mean_abs_reasoning": 0.5118650197982788, "adv/mean_abs_step_conf": 0.7944930791854858, "adv/ratio_final_to_reasoning": 1.3023261257127952, "adv/ratio_step_to_reasoning": 1.5521534944868631, "adv/std_final_conf": 0.8759217858314514, "adv/std_reasoning": 0.7394871711730957, "adv/std_step_conf": 0.9344072341918945, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7245745943806885, "calib/avg_num_step_conf": 8.2578125, "calib/ece": 0.2508906882591092, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6072874493927125, "calib/gap": 0.2473934837092734, "calib/mean_conf": 0.7893522267206476, "calib/mu_c": 0.9035338345864663, "calib/mu_w": 0.6561403508771929, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2508906882591092, "calib/std_conf": 0.27891124614075113, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8831876138433515, "calib/step_q_c_n": 1098.0, "calib/step_q_gap": 0.10113052722917837, "calib/step_q_w": 0.7820570866141732, "calib/step_q_w_n": 1016.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2034.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 830.46875, "completions/mean_terminated_length": 850.4000244140625, "completions/min_length": 0.0, "completions/min_terminated_length": 351.0, "epoch": 0.1952, "grad_norm": 0.07475608587265015, "kl": 0.05438232421875, "learning_rate": 4.7222222222222226e-07, "loss": -0.1073, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.0174814835190773, "mask/share_reasoning": 0.8535329103469849, "mask/share_step_conf": 0.10554810613393784, "num_tokens": 57518176.0, "reward": 0.8417513370513916, "reward_std": 0.26017308235168457, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.7079120874404907, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.6787155866622925, "step": 183 }, { "adv/mean_abs_final_conf": 0.6304896473884583, "adv/mean_abs_reasoning": 0.4839908480644226, "adv/mean_abs_step_conf": 0.7729167938232422, "adv/ratio_final_to_reasoning": 1.302689193214942, "adv/ratio_step_to_reasoning": 1.5969657213856272, "adv/std_final_conf": 0.8440559506416321, "adv/std_reasoning": 0.7576839923858643, "adv/std_step_conf": 0.9316712617874146, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6616161616161615, "calib/avg_num_step_conf": 9.109375, "calib/ece": 0.194320987654321, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.7037037037037037, "calib/gap": 0.17838461538461503, "calib/mean_conf": 0.8474074074074074, "calib/mu_c": 0.9046666666666665, "calib/mu_w": 0.7262820512820515, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18135802469135803, "calib/std_conf": 0.24131725840558851, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8664727540500737, "calib/step_q_c_n": 1358.0, "calib/step_q_gap": 0.07795119347512514, "calib/step_q_w": 0.7885215605749486, "calib/step_q_w_n": 974.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2701.0, "completions/max_terminated_length": 2701.0, "completions/mean_length": 785.9140625, "completions/mean_terminated_length": 817.8617553710938, "completions/min_length": 0.0, "completions/min_terminated_length": 444.0, "epoch": 0.19626666666666667, "grad_norm": 0.054167453199625015, "kl": 0.053737640380859375, "learning_rate": 4.444444444444445e-07, "loss": -0.1561, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.018033988773822784, "mask/share_reasoning": 0.8329838514328003, "mask/share_step_conf": 0.10991968959569931, "num_tokens": 57824650.0, "reward": 0.894181489944458, "reward_std": 0.2465510070323944, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7339491844177246, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.735663652420044, "step": 184 }, { "adv/mean_abs_final_conf": 0.5086137056350708, "adv/mean_abs_reasoning": 0.39162346720695496, "adv/mean_abs_step_conf": 0.7781357169151306, "adv/ratio_final_to_reasoning": 1.2987314301221176, "adv/ratio_step_to_reasoning": 1.9869486434629868, "adv/std_final_conf": 0.7587011456489563, "adv/std_reasoning": 0.681826114654541, "adv/std_step_conf": 0.9302533864974976, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7149665670792431, "calib/avg_num_step_conf": 9.171875, "calib/ece": 0.26950207468879667, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.7053941908713693, "calib/gap": 0.21292360221937678, "calib/mean_conf": 0.843941908713693, "calib/mu_c": 0.9314084507042251, "calib/mu_w": 0.7184848484848483, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2621161825726141, "calib/std_conf": 0.24912923196358036, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8771261487050961, "calib/step_q_c_n": 1197.0, "calib/step_q_gap": 0.14506706964341065, "calib/step_q_w": 0.7320590790616854, "calib/step_q_w_n": 1151.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2567.0, "completions/max_terminated_length": 2567.0, "completions/mean_length": 786.78125, "completions/mean_terminated_length": 828.8723754882812, "completions/min_length": 0.0, "completions/min_terminated_length": 464.0, "epoch": 0.19733333333333333, "grad_norm": 0.045686863362789154, "kl": 0.04729461669921875, "learning_rate": 4.1666666666666667e-07, "loss": -0.2002, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.01783943921327591, "mask/share_reasoning": 0.8219054937362671, "mask/share_step_conf": 0.10947383940219879, "num_tokens": 58132986.0, "reward": 0.833606481552124, "reward_std": 0.2180936634540558, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6881363391876221, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.6806391477584839, "step": 185 }, { "adv/mean_abs_final_conf": 0.6431267857551575, "adv/mean_abs_reasoning": 0.5003708600997925, "adv/mean_abs_step_conf": 0.787868857383728, "adv/ratio_final_to_reasoning": 1.2853002383609913, "adv/ratio_step_to_reasoning": 1.5745698245229505, "adv/std_final_conf": 0.8440446853637695, "adv/std_reasoning": 0.7578926682472229, "adv/std_step_conf": 0.9339975118637085, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.6725707782045811, "calib/avg_num_step_conf": 8.6796875, "calib/ece": 0.2716182572614107, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.6597510373443983, "calib/gap": 0.17574619433774386, "calib/mean_conf": 0.8106224066390042, "calib/mu_c": 0.8828169014084508, "calib/mu_w": 0.707070707070707, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24651452282157665, "calib/std_conf": 0.2773631518500008, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8730307813733229, "calib/step_q_c_n": 1267.0, "calib/step_q_gap": 0.09363811121625487, "calib/step_q_w": 0.779392670157068, "calib/step_q_w_n": 955.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3024.0, "completions/max_terminated_length": 3024.0, "completions/mean_length": 814.22265625, "completions/mean_terminated_length": 840.4878540039062, "completions/min_length": 0.0, "completions/min_terminated_length": 357.0, "epoch": 0.1984, "grad_norm": 0.05286160111427307, "kl": 0.05419921875, "learning_rate": 3.8888888888888895e-07, "loss": -0.1288, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.01790950819849968, "mask/share_reasoning": 0.8364917039871216, "mask/share_step_conf": 0.11434879899024963, "num_tokens": 58446467.0, "reward": 0.8335369825363159, "reward_std": 0.25954774022102356, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6725648641586304, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.6960716247558594, "step": 186 }, { "adv/mean_abs_final_conf": 0.6640216708183289, "adv/mean_abs_reasoning": 0.5353648662567139, "adv/mean_abs_step_conf": 0.779157280921936, "adv/ratio_final_to_reasoning": 1.240316114617657, "adv/ratio_step_to_reasoning": 1.4553761930061373, "adv/std_final_conf": 0.8601834774017334, "adv/std_reasoning": 0.7929431796073914, "adv/std_step_conf": 0.9341410398483276, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7007133152173914, "calib/avg_num_step_conf": 9.73828125, "calib/ece": 0.28650205761316866, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6090534979423868, "calib/gap": 0.19911209239130456, "calib/mean_conf": 0.8013168724279836, "calib/mu_c": 0.8955468750000001, "calib/mu_w": 0.6964347826086955, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28053497942386824, "calib/std_conf": 0.25901664786592343, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8671660649819495, "calib/step_q_c_n": 1108.0, "calib/step_q_gap": 0.08116606498194934, "calib/step_q_w": 0.7860000000000001, "calib/step_q_w_n": 1385.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2983.0, "completions/max_terminated_length": 2983.0, "completions/mean_length": 866.58984375, "completions/mean_terminated_length": 887.3880615234375, "completions/min_length": 0.0, "completions/min_terminated_length": 468.0, "epoch": 0.19946666666666665, "grad_norm": 0.08439423888921738, "kl": 0.05120849609375, "learning_rate": 3.611111111111111e-07, "loss": -0.0256, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.01723765954375267, "mask/share_reasoning": 0.8504993915557861, "mask/share_step_conf": 0.1088254451751709, "num_tokens": 58769858.0, "reward": 0.8159067034721375, "reward_std": 0.25626736879348755, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.6715812683105469, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.6703883409500122, "step": 187 }, { "adv/mean_abs_final_conf": 0.6788389086723328, "adv/mean_abs_reasoning": 0.39350107312202454, "adv/mean_abs_step_conf": 0.772308349609375, "adv/ratio_final_to_reasoning": 1.7251259400297114, "adv/ratio_step_to_reasoning": 1.9626588143252215, "adv/std_final_conf": 0.8910698890686035, "adv/std_reasoning": 0.6816974878311157, "adv/std_step_conf": 0.9345043897628784, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7079327778964609, "calib/avg_num_step_conf": 8.54296875, "calib/ece": 0.19024590163934435, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.6147540983606558, "calib/gap": 0.23190201523890885, "calib/mean_conf": 0.7911475409836065, "calib/mu_c": 0.8795364238410595, "calib/mu_w": 0.6476344086021506, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18127049180327878, "calib/std_conf": 0.2817820827626987, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8599916387959866, "calib/step_q_c_n": 1196.0, "calib/step_q_gap": 0.05549113425511876, "calib/step_q_w": 0.8045005045408679, "calib/step_q_w_n": 991.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2889.0, "completions/max_terminated_length": 2889.0, "completions/mean_length": 838.92578125, "completions/mean_terminated_length": 862.510009765625, "completions/min_length": 0.0, "completions/min_terminated_length": 374.0, "epoch": 0.20053333333333334, "grad_norm": 0.058961011469364166, "kl": 0.05727386474609375, "learning_rate": 3.3333333333333335e-07, "loss": -0.1537, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.017490971833467484, "mask/share_reasoning": 0.8490737080574036, "mask/share_step_conf": 0.10609160363674164, "num_tokens": 59088695.0, "reward": 0.8640185594558716, "reward_std": 0.23061273992061615, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7286055088043213, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.6908378601074219, "step": 188 }, { "adv/mean_abs_final_conf": 0.6422284245491028, "adv/mean_abs_reasoning": 0.4257098436355591, "adv/mean_abs_step_conf": 0.7781589031219482, "adv/ratio_final_to_reasoning": 1.5086060004261976, "adv/ratio_step_to_reasoning": 1.8279091140493173, "adv/std_final_conf": 0.8458375334739685, "adv/std_reasoning": 0.6818082332611084, "adv/std_step_conf": 0.9307503700256348, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7350993377483444, "calib/avg_num_step_conf": 8.56640625, "calib/ece": 0.18626016260162592, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5447154471544715, "calib/gap": 0.25046357615894055, "calib/mean_conf": 0.753739837398374, "calib/mu_c": 0.8504635761589405, "calib/mu_w": 0.6, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16308943089430886, "calib/std_conf": 0.2916853142674879, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8517623918174665, "calib/step_q_c_n": 1271.0, "calib/step_q_gap": 0.07708777142701095, "calib/step_q_w": 0.7746746203904555, "calib/step_q_w_n": 922.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3007.0, "completions/max_terminated_length": 3007.0, "completions/mean_length": 779.9453125, "completions/mean_terminated_length": 805.1047973632812, "completions/min_length": 0.0, "completions/min_terminated_length": 376.0, "epoch": 0.2016, "grad_norm": 0.067589171230793, "kl": 0.059906005859375, "learning_rate": 3.055555555555556e-07, "loss": -0.162, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.018148701637983322, "mask/share_reasoning": 0.8403629064559937, "mask/share_step_conf": 0.11023841053247452, "num_tokens": 59396129.0, "reward": 0.8732390403747559, "reward_std": 0.2376517951488495, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.742780864238739, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.6943222284317017, "step": 189 }, { "adv/mean_abs_final_conf": 0.6645102500915527, "adv/mean_abs_reasoning": 0.48422133922576904, "adv/mean_abs_step_conf": 0.7489802837371826, "adv/ratio_final_to_reasoning": 1.3723274797307594, "adv/ratio_step_to_reasoning": 1.5467725667248409, "adv/std_final_conf": 0.8758800625801086, "adv/std_reasoning": 0.7576428055763245, "adv/std_step_conf": 0.9346054196357727, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.652066647597254, "calib/avg_num_step_conf": 8.93359375, "calib/ece": 0.20122950819672128, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.5901639344262295, "calib/gap": 0.17271453089244837, "calib/mean_conf": 0.7770491803278687, "calib/mu_c": 0.8421710526315789, "calib/mu_w": 0.6694565217391305, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1776639344262295, "calib/std_conf": 0.29038185296625374, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8654778554778556, "calib/step_q_c_n": 1287.0, "calib/step_q_gap": 0.06381785547785568, "calib/step_q_w": 0.8016599999999999, "calib/step_q_w_n": 1000.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2796.0, "completions/max_terminated_length": 2796.0, "completions/mean_length": 830.8671875, "completions/mean_terminated_length": 844.0556030273438, "completions/min_length": 0.0, "completions/min_terminated_length": 428.0, "epoch": 0.20266666666666666, "grad_norm": 0.06410624831914902, "kl": 0.05812835693359375, "learning_rate": 2.7777777777777776e-07, "loss": -0.0981, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01752610132098198, "mask/share_reasoning": 0.8577367663383484, "mask/share_step_conf": 0.10911212116479874, "num_tokens": 59714439.0, "reward": 0.8567715883255005, "reward_std": 0.2456488162279129, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7035819888114929, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.6998047828674316, "step": 190 }, { "adv/mean_abs_final_conf": 0.6169793605804443, "adv/mean_abs_reasoning": 0.37427815794944763, "adv/mean_abs_step_conf": 0.7451186180114746, "adv/ratio_final_to_reasoning": 1.6484514190212978, "adv/ratio_step_to_reasoning": 1.9908151255572735, "adv/std_final_conf": 0.8437768816947937, "adv/std_reasoning": 0.6816237568855286, "adv/std_step_conf": 0.933275043964386, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7011305241521069, "calib/avg_num_step_conf": 9.58203125, "calib/ece": 0.27098360655737697, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.6352459016393442, "calib/gap": 0.19729359369647137, "calib/mean_conf": 0.8028688524590163, "calib/mu_c": 0.8877697841726618, "calib/mu_w": 0.6904761904761905, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2520901639344262, "calib/std_conf": 0.2717203224272977, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8665474209650583, "calib/step_q_c_n": 1202.0, "calib/step_q_gap": 0.0922228805973524, "calib/step_q_w": 0.7743245403677059, "calib/step_q_w_n": 1251.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2811.0, "completions/max_terminated_length": 2811.0, "completions/mean_length": 791.0625, "completions/mean_terminated_length": 819.8866577148438, "completions/min_length": 0.0, "completions/min_terminated_length": 399.0, "epoch": 0.20373333333333332, "grad_norm": 0.03953538462519646, "kl": 0.057159423828125, "learning_rate": 2.5000000000000004e-07, "loss": -0.1979, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01807655580341816, "mask/share_reasoning": 0.8292906284332275, "mask/share_step_conf": 0.11747653782367706, "num_tokens": 60021119.0, "reward": 0.8270571231842041, "reward_std": 0.20282432436943054, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.689464807510376, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.6654306650161743, "step": 191 }, { "adv/mean_abs_final_conf": 0.5836527347564697, "adv/mean_abs_reasoning": 0.39761674404144287, "adv/mean_abs_step_conf": 0.7857372164726257, "adv/ratio_final_to_reasoning": 1.467877657324302, "adv/ratio_step_to_reasoning": 1.97611702285538, "adv/std_final_conf": 0.8277207612991333, "adv/std_reasoning": 0.6614072322845459, "adv/std_step_conf": 0.9336384534835815, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.756342062193126, "calib/avg_num_step_conf": 9.07421875, "calib/ece": 0.16856000000000007, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.58, "calib/gap": 0.27840698308783396, "calib/mean_conf": 0.7709600000000001, "calib/mu_c": 0.8756410256410254, "calib/mu_w": 0.5972340425531915, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15776000000000007, "calib/std_conf": 0.2851972622589495, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8703448275862069, "calib/step_q_c_n": 1392.0, "calib/step_q_gap": 0.11804622393421982, "calib/step_q_w": 0.7522986036519871, "calib/step_q_w_n": 931.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2954.0, "completions/max_terminated_length": 2954.0, "completions/mean_length": 818.40234375, "completions/mean_terminated_length": 831.3928833007812, "completions/min_length": 0.0, "completions/min_terminated_length": 356.0, "epoch": 0.2048, "grad_norm": 0.05431187525391579, "kl": 0.05841064453125, "learning_rate": 2.2222222222222224e-07, "loss": -0.0675, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.018427275121212006, "mask/share_reasoning": 0.8483537435531616, "mask/share_step_conf": 0.11759395897388458, "num_tokens": 60335606.0, "reward": 0.9095621109008789, "reward_std": 0.23026014864444733, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7744953036308289, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7274413704872131, "step": 192 }, { "adv/mean_abs_final_conf": 0.7208178639411926, "adv/mean_abs_reasoning": 0.5967491269111633, "adv/mean_abs_step_conf": 0.7681723833084106, "adv/ratio_final_to_reasoning": 1.2079076976152814, "adv/ratio_step_to_reasoning": 1.2872618470085615, "adv/std_final_conf": 0.9065069556236267, "adv/std_reasoning": 0.8100512623786926, "adv/std_step_conf": 0.9342743754386902, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7010898109243697, "calib/avg_num_step_conf": 8.7265625, "calib/ece": 0.24729838709677424, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5887096774193549, "calib/gap": 0.23204306722689072, "calib/mean_conf": 0.7710887096774194, "calib/mu_c": 0.8758823529411766, "calib/mu_w": 0.6438392857142858, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23500000000000004, "calib/std_conf": 0.29827870470246487, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8694429708222812, "calib/step_q_c_n": 1131.0, "calib/step_q_gap": 0.10144659729553596, "calib/step_q_w": 0.7679963735267452, "calib/step_q_w_n": 1103.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2984.0, "completions/max_terminated_length": 2984.0, "completions/mean_length": 845.15234375, "completions/mean_terminated_length": 855.1739501953125, "completions/min_length": 0.0, "completions/min_terminated_length": 463.0, "epoch": 0.20586666666666667, "grad_norm": 0.04506753757596016, "kl": 0.05387115478515625, "learning_rate": 1.9444444444444447e-07, "loss": -0.045, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.017770592123270035, "mask/share_reasoning": 0.8608217239379883, "mask/share_step_conf": 0.10968892276287079, "num_tokens": 60657677.0, "reward": 0.8366061449050903, "reward_std": 0.26884931325912476, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6995316743850708, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.6752431988716125, "step": 193 }, { "adv/mean_abs_final_conf": 0.6042905449867249, "adv/mean_abs_reasoning": 0.4425891637802124, "adv/mean_abs_step_conf": 0.7328290939331055, "adv/ratio_final_to_reasoning": 1.36535323148312, "adv/ratio_step_to_reasoning": 1.655777307500969, "adv/std_final_conf": 0.8594117760658264, "adv/std_reasoning": 0.720699667930603, "adv/std_step_conf": 0.9311245679855347, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.699829293274155, "calib/avg_num_step_conf": 9.1015625, "calib/ece": 0.20357723577235765, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5853658536585366, "calib/gap": 0.23855786958006153, "calib/mean_conf": 0.7653658536585366, "calib/mu_c": 0.8633103448275863, "calib/mu_w": 0.6247524752475248, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1897560975609755, "calib/std_conf": 0.30406672535509977, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8639723801787165, "calib/step_q_c_n": 1231.0, "calib/step_q_gap": 0.1521434447829021, "calib/step_q_w": 0.7118289353958144, "calib/step_q_w_n": 1099.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2497.0, "completions/max_terminated_length": 2497.0, "completions/mean_length": 770.81640625, "completions/mean_terminated_length": 795.6814575195312, "completions/min_length": 0.0, "completions/min_terminated_length": 436.0, "epoch": 0.20693333333333333, "grad_norm": 0.04400273412466049, "kl": 0.0572967529296875, "learning_rate": 1.6666666666666668e-07, "loss": -0.1635, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.018388589844107628, "mask/share_reasoning": 0.8372006416320801, "mask/share_step_conf": 0.11316081881523132, "num_tokens": 60960950.0, "reward": 0.8722167611122131, "reward_std": 0.22317549586296082, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7207523584365845, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.718212366104126, "step": 194 }, { "adv/mean_abs_final_conf": 0.6190444231033325, "adv/mean_abs_reasoning": 0.3682692050933838, "adv/mean_abs_step_conf": 0.7552182674407959, "adv/ratio_final_to_reasoning": 1.6809562530387476, "adv/ratio_step_to_reasoning": 2.0507233757144903, "adv/std_final_conf": 0.8602829575538635, "adv/std_reasoning": 0.6612259745597839, "adv/std_step_conf": 0.9333444237709045, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.658011583011583, "calib/avg_num_step_conf": 8.7578125, "calib/ece": 0.22916996047430815, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5889328063241107, "calib/gap": 0.18003217503217495, "calib/mean_conf": 0.7805533596837946, "calib/mu_c": 0.8552702702702701, "calib/mu_w": 0.6752380952380952, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21237154150197615, "calib/std_conf": 0.2846113953485178, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8665965732087227, "calib/step_q_c_n": 1284.0, "calib/step_q_gap": 0.03806838949264757, "calib/step_q_w": 0.8285281837160752, "calib/step_q_w_n": 958.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2330.0, "completions/max_terminated_length": 2330.0, "completions/mean_length": 810.71484375, "completions/mean_terminated_length": 817.0984497070312, "completions/min_length": 0.0, "completions/min_terminated_length": 402.0, "epoch": 0.208, "grad_norm": 0.049184273928403854, "kl": 0.06110382080078125, "learning_rate": 1.3888888888888888e-07, "loss": -0.0271, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.01838977076113224, "mask/share_reasoning": 0.858869194984436, "mask/share_step_conf": 0.11492855846881866, "num_tokens": 61274477.0, "reward": 0.8680363893508911, "reward_std": 0.18345674872398376, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7168843746185303, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7059070467948914, "step": 195 }, { "adv/mean_abs_final_conf": 0.4948877692222595, "adv/mean_abs_reasoning": 0.27376601099967957, "adv/mean_abs_step_conf": 0.7344076633453369, "adv/ratio_final_to_reasoning": 1.807703474274017, "adv/ratio_step_to_reasoning": 2.682610820326401, "adv/std_final_conf": 0.7586008906364441, "adv/std_reasoning": 0.572577714920044, "adv/std_step_conf": 0.9284154772758484, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6725761217948718, "calib/avg_num_step_conf": 9.08203125, "calib/ece": 0.24845238095238098, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7698412698412699, "calib/gap": 0.21816506410256375, "calib/mean_conf": 0.8559920634920636, "calib/mu_c": 0.9391025641025639, "calib/mu_w": 0.7209375000000001, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24269841269841275, "calib/std_conf": 0.2712467685721276, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.873566384180791, "calib/step_q_c_n": 1416.0, "calib/step_q_gap": 0.08908893643601656, "calib/step_q_w": 0.7844774477447745, "calib/step_q_w_n": 909.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2989.0, "completions/max_terminated_length": 2989.0, "completions/mean_length": 749.15625, "completions/mean_terminated_length": 752.0941772460938, "completions/min_length": 0.0, "completions/min_terminated_length": 406.0, "epoch": 0.20906666666666668, "grad_norm": 0.049532778561115265, "kl": 0.05275726318359375, "learning_rate": 1.1111111111111112e-07, "loss": 0.0266, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.01997404731810093, "mask/share_reasoning": 0.8479194641113281, "mask/share_step_conf": 0.12820029258728027, "num_tokens": 61568805.0, "reward": 0.8798357844352722, "reward_std": 0.17858222126960754, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7258324027061462, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7150891423225403, "step": 196 }, { "adv/mean_abs_final_conf": 0.6538981199264526, "adv/mean_abs_reasoning": 0.5470722317695618, "adv/mean_abs_step_conf": 0.755987823009491, "adv/ratio_final_to_reasoning": 1.1952683429231117, "adv/ratio_step_to_reasoning": 1.3818793554265587, "adv/std_final_conf": 0.8599660396575928, "adv/std_reasoning": 0.7929037809371948, "adv/std_step_conf": 0.9324612617492676, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7064814814814814, "calib/avg_num_step_conf": 9.3671875, "calib/ece": 0.24326612903225808, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5443548387096774, "calib/gap": 0.21345238095238106, "calib/mean_conf": 0.7538306451612904, "calib/mu_c": 0.8467857142857144, "calib/mu_w": 0.6333333333333333, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21629032258064515, "calib/std_conf": 0.28916635522888545, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8520353982300884, "calib/step_q_c_n": 1243.0, "calib/step_q_gap": 0.1004076926023828, "calib/step_q_w": 0.7516277056277056, "calib/step_q_w_n": 1155.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2801.0, "completions/max_terminated_length": 2801.0, "completions/mean_length": 814.2109375, "completions/mean_terminated_length": 827.1349487304688, "completions/min_length": 0.0, "completions/min_terminated_length": 475.0, "epoch": 0.21013333333333334, "grad_norm": 0.08905098587274551, "kl": 0.06281280517578125, "learning_rate": 8.333333333333334e-08, "loss": 0.0467, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.018287431448698044, "mask/share_reasoning": 0.8483529686927795, "mask/share_step_conf": 0.11773453652858734, "num_tokens": 61882299.0, "reward": 0.8597941398620605, "reward_std": 0.20589497685432434, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7164624929428101, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.700782060623169, "step": 197 }, { "adv/mean_abs_final_conf": 0.58735591173172, "adv/mean_abs_reasoning": 0.4391744136810303, "adv/mean_abs_step_conf": 0.7891101837158203, "adv/ratio_final_to_reasoning": 1.337409223840424, "adv/ratio_step_to_reasoning": 1.7968036368551887, "adv/std_final_conf": 0.8107753396034241, "adv/std_reasoning": 0.7014614343643188, "adv/std_step_conf": 0.9324103593826294, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.666535770384121, "calib/avg_num_step_conf": 9.44921875, "calib/ece": 0.17724279835390927, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.588477366255144, "calib/gap": 0.21069041839211777, "calib/mean_conf": 0.7669547325102881, "calib/mu_c": 0.8484563758389263, "calib/mu_w": 0.6377659574468085, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1655144032921809, "calib/std_conf": 0.29970456304725857, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.870100154083205, "calib/step_q_c_n": 1298.0, "calib/step_q_gap": 0.11071567593869103, "calib/step_q_w": 0.7593844781445139, "calib/step_q_w_n": 1121.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2972.0, "completions/max_terminated_length": 2972.0, "completions/mean_length": 781.1953125, "completions/mean_terminated_length": 812.951171875, "completions/min_length": 0.0, "completions/min_terminated_length": 392.0, "epoch": 0.2112, "grad_norm": 0.05872613564133644, "kl": 0.05881500244140625, "learning_rate": 5.555555555555556e-08, "loss": -0.1555, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.018448233604431152, "mask/share_reasoning": 0.8231604099273682, "mask/share_step_conf": 0.11932889372110367, "num_tokens": 62187669.0, "reward": 0.8654526472091675, "reward_std": 0.21482285857200623, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7112331986427307, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.7134221196174622, "step": 198 }, { "adv/mean_abs_final_conf": 0.6843090057373047, "adv/mean_abs_reasoning": 0.6271142959594727, "adv/mean_abs_step_conf": 0.777929961681366, "adv/ratio_final_to_reasoning": 1.091203007404456, "adv/ratio_step_to_reasoning": 1.2404915127810128, "adv/std_final_conf": 0.876064121723175, "adv/std_reasoning": 0.8429972529411316, "adv/std_step_conf": 0.9352033734321594, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6885495666305526, "calib/avg_num_step_conf": 8.58984375, "calib/ece": 0.24605691056910572, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6056910569105691, "calib/gap": 0.20585319609967523, "calib/mean_conf": 0.7724796747967478, "calib/mu_c": 0.8595070422535213, "calib/mu_w": 0.653653846153846, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22065040650406506, "calib/std_conf": 0.2989819270061604, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8696669442131559, "calib/step_q_c_n": 1201.0, "calib/step_q_gap": 0.09017796625724406, "calib/step_q_w": 0.7794889779559119, "calib/step_q_w_n": 998.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2965.0, "completions/max_terminated_length": 2965.0, "completions/mean_length": 834.5078125, "completions/mean_terminated_length": 847.7540283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 399.0, "epoch": 0.21226666666666666, "grad_norm": 0.03624487668275833, "kl": 0.05606842041015625, "learning_rate": 2.777777777777778e-08, "loss": -0.074, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.018301576375961304, "mask/share_reasoning": 0.8528690934181213, "mask/share_step_conf": 0.11320433020591736, "num_tokens": 62505503.0, "reward": 0.848616361618042, "reward_std": 0.28091567754745483, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7004519701004028, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.693655788898468, "step": 199 }, { "adv/mean_abs_final_conf": 0.5117772817611694, "adv/mean_abs_reasoning": 0.33070600032806396, "adv/mean_abs_step_conf": 0.7476848363876343, "adv/ratio_final_to_reasoning": 1.5475294710512684, "adv/ratio_step_to_reasoning": 2.2608747214925726, "adv/std_final_conf": 0.7761644124984741, "adv/std_reasoning": 0.6402080655097961, "adv/std_step_conf": 0.925877034664154, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7651164421997755, "calib/avg_num_step_conf": 8.72265625, "calib/ece": 0.16052, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.624, "calib/gap": 0.2816526374859708, "calib/mean_conf": 0.8009200000000001, "calib/mu_c": 0.9000617283950617, "calib/mu_w": 0.6184090909090909, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15672, "calib/std_conf": 0.2658938765748471, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8784808259587019, "calib/step_q_c_n": 1356.0, "calib/step_q_gap": 0.06750021022324004, "calib/step_q_w": 0.8109806157354619, "calib/step_q_w_n": 877.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2239.0, "completions/max_terminated_length": 2239.0, "completions/mean_length": 788.03515625, "completions/mean_terminated_length": 803.7330932617188, "completions/min_length": 0.0, "completions/min_terminated_length": 447.0, "epoch": 0.21333333333333335, "grad_norm": 0.04874600097537041, "kl": 0.0551300048828125, "learning_rate": 0.0, "loss": -0.1078, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.018403755500912666, "mask/share_reasoning": 0.850422739982605, "mask/share_step_conf": 0.11164219677448273, "num_tokens": 62815288.0, "reward": 0.9284921288490295, "reward_std": 0.15986087918281555, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7874097228050232, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.747699499130249, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": 0.2748976366652641, "train_runtime": 22212.2891, "train_samples_per_second": 2.305, "train_steps_per_second": 0.009 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 62815288, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }