{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.773959219455719, "adv/mean_abs_reasoning": 0.47714588046073914, "adv/mean_abs_step_conf": 0.7502421140670776, "adv/ratio_final_to_reasoning": 1.622059942565935, "adv/ratio_step_to_reasoning": 1.5723537492194897, "adv/std_final_conf": 0.9294352531433105, "adv/std_reasoning": 0.7393431663513184, "adv/std_step_conf": 0.9357826709747314, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.38076182006817844, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.2003187250996017, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2948207171314741, "calib/gap": -0.026059730250481805, "calib/mean_conf": 0.8737051792828686, "calib/mu_c": 0.865606936416185, "calib/mu_w": 0.8916666666666668, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19239043824701207, "calib/std_conf": 0.09027744273295583, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7959393232205367, "calib/step_q_c_n": 857.0, "calib/step_q_gap": -0.006446568895645877, "calib/step_q_w": 0.8023858921161826, "calib/step_q_w_n": 482.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 474.94921875, "completions/mean_terminated_length": 478.68896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.0010666666666666667, "grad_norm": 0.04299849271774292, "learning_rate": 2.5000000000000004e-07, "loss": -0.0136, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03466901555657387, "mask/share_reasoning": 0.8340686559677124, "mask/share_step_conf": 0.12344987690448761, "num_tokens": 229171.0, "reward": 1.264374852180481, "reward_std": 0.26098379492759705, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7142800688743591, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7420004606246948, "step": 1 }, { "adv/mean_abs_final_conf": 0.7672724723815918, "adv/mean_abs_reasoning": 0.5104547739028931, "adv/mean_abs_step_conf": 0.770934522151947, "adv/ratio_final_to_reasoning": 1.503115479781084, "adv/ratio_step_to_reasoning": 1.5102895722914849, "adv/std_final_conf": 0.9330522418022156, "adv/std_reasoning": 0.7575037479400635, "adv/std_step_conf": 0.9358851313591003, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.44343065693430656, "calib/avg_num_step_conf": 5.05859375, "calib/ece": 0.3349411764705883, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2823529411764706, "calib/gap": 0.002352468143016151, "calib/mean_conf": 0.8721960784313726, "calib/mu_c": 0.8732846715328467, "calib/mu_w": 0.8709322033898306, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3349411764705883, "calib/std_conf": 0.07627016470309335, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7954391371340525, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.011011892552009073, "calib/step_q_w": 0.7844272445820434, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1966.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 492.9765625, "completions/mean_terminated_length": 494.9098205566406, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.0021333333333333334, "grad_norm": 0.04045659676194191, "learning_rate": 5.000000000000001e-07, "loss": -0.0158, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03364308178424835, "mask/share_reasoning": 0.8523939251899719, "mask/share_step_conf": 0.11005672812461853, "num_tokens": 458661.0, "reward": 1.198354721069336, "reward_std": 0.24474793672561646, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6320762038230896, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7291916012763977, "step": 2 }, { "adv/mean_abs_final_conf": 0.7855177521705627, "adv/mean_abs_reasoning": 0.4552287757396698, "adv/mean_abs_step_conf": 0.7503337860107422, "adv/ratio_final_to_reasoning": 1.7255450314937346, "adv/ratio_step_to_reasoning": 1.6482564943122864, "adv/std_final_conf": 0.9304200410842896, "adv/std_reasoning": 0.7206604480743408, "adv/std_step_conf": 0.9357056617736816, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4619900301718483, "calib/avg_num_step_conf": 5.15625, "calib/ece": 0.2755335968379446, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.3241106719367589, "calib/gap": -0.007893217893217952, "calib/mean_conf": 0.8825691699604744, "calib/mu_c": 0.8794805194805194, "calib/mu_w": 0.8873737373737374, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2747035573122529, "calib/std_conf": 0.044117613178770984, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.804808972503618, "calib/step_q_c_n": 691.0, "calib/step_q_gap": 0.04946716010298213, "calib/step_q_w": 0.7553418124006359, "calib/step_q_w_n": 629.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2973.0, "completions/max_terminated_length": 2973.0, "completions/mean_length": 504.58203125, "completions/mean_terminated_length": 508.55511474609375, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.0032, "grad_norm": 0.04432070627808571, "learning_rate": 7.5e-07, "loss": 0.0416, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03285394608974457, "mask/share_reasoning": 0.8470344543457031, "mask/share_step_conf": 0.11229914426803589, "num_tokens": 693090.0, "reward": 1.2373484373092651, "reward_std": 0.2528243064880371, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.671227753162384, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7435314059257507, "step": 3 }, { "adv/mean_abs_final_conf": 0.744471549987793, "adv/mean_abs_reasoning": 0.43164411187171936, "adv/mean_abs_step_conf": 0.7570846676826477, "adv/ratio_final_to_reasoning": 1.7247346355764561, "adv/ratio_step_to_reasoning": 1.7539557400649688, "adv/std_final_conf": 0.9300883412361145, "adv/std_reasoning": 0.7013282775878906, "adv/std_step_conf": 0.9355937838554382, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4211795373085696, "calib/avg_num_step_conf": 5.05078125, "calib/ece": 0.27570866141732275, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.3110236220472441, "calib/gap": -0.012273053111762744, "calib/mean_conf": 0.885944881889764, "calib/mu_c": 0.8811612903225807, "calib/mu_w": 0.8934343434343435, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27570866141732275, "calib/std_conf": 0.043322778116439024, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7986074270557029, "calib/step_q_c_n": 754.0, "calib/step_q_gap": 0.00246642520041529, "calib/step_q_w": 0.7961410018552876, "calib/step_q_w_n": 539.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2598.0, "completions/max_terminated_length": 2598.0, "completions/mean_length": 508.6171875, "completions/mean_terminated_length": 508.6171875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.004266666666666667, "grad_norm": 0.03628219664096832, "learning_rate": 1.0000000000000002e-06, "loss": 0.0335, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03336188197135925, "mask/share_reasoning": 0.8539849519729614, "mask/share_step_conf": 0.11265319585800171, "num_tokens": 929464.0, "reward": 1.2149934768676758, "reward_std": 0.23332801461219788, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6731215119361877, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7186670899391174, "step": 4 }, { "adv/mean_abs_final_conf": 0.78107088804245, "adv/mean_abs_reasoning": 0.4505809545516968, "adv/mean_abs_step_conf": 0.7702205181121826, "adv/ratio_final_to_reasoning": 1.7334751505854757, "adv/ratio_step_to_reasoning": 1.7093943060210115, "adv/std_final_conf": 0.9305281043052673, "adv/std_reasoning": 0.7014207243919373, "adv/std_step_conf": 0.9359554052352905, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.4787919017590442, "calib/avg_num_step_conf": 4.55078125, "calib/ece": 0.3518292682926829, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.32113821138211385, "calib/gap": -0.0009764354463989156, "calib/mean_conf": 0.8843495934959348, "calib/mu_c": 0.8838931297709923, "calib/mu_w": 0.8848695652173912, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.3518292682926829, "calib/std_conf": 0.051436516044164424, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.8001442307692307, "calib/step_q_c_n": 624.0, "calib/step_q_gap": 0.026244045926347215, "calib/step_q_w": 0.7739001848428835, "calib/step_q_w_n": 541.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2823.0, "completions/max_terminated_length": 2823.0, "completions/mean_length": 518.09375, "completions/mean_terminated_length": 520.1255493164062, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.005333333333333333, "grad_norm": 0.03847824037075043, "learning_rate": 1.25e-06, "loss": -0.0223, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03429059311747551, "mask/share_reasoning": 0.8538260459899902, "mask/share_step_conf": 0.10797706246376038, "num_tokens": 1168784.0, "reward": 1.1169718503952026, "reward_std": 0.2400004267692566, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5990898609161377, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.6705517768859863, "step": 5 }, { "adv/mean_abs_final_conf": 0.7595282793045044, "adv/mean_abs_reasoning": 0.3384864032268524, "adv/mean_abs_step_conf": 0.775572657585144, "adv/ratio_final_to_reasoning": 2.243895979465595, "adv/ratio_step_to_reasoning": 2.2912963421616612, "adv/std_final_conf": 0.9284359216690063, "adv/std_reasoning": 0.6184952855110168, "adv/std_step_conf": 0.9360383749008179, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5289813938820561, "calib/avg_num_step_conf": 5.06640625, "calib/ece": 0.29503906250000006, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.3125, "calib/gap": 0.002615578681804065, "calib/mean_conf": 0.8823046875, "calib/mu_c": 0.8833774834437087, "calib/mu_w": 0.8807619047619046, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.29375000000000007, "calib/std_conf": 0.04716743092990484, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.792674094707521, "calib/step_q_c_n": 718.0, "calib/step_q_gap": -0.009294817209577366, "calib/step_q_w": 0.8019689119170984, "calib/step_q_w_n": 579.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1232.0, "completions/max_terminated_length": 1232.0, "completions/mean_length": 435.56640625, "completions/mean_terminated_length": 437.2745361328125, "completions/min_length": 0.0, "completions/min_terminated_length": 159.0, "epoch": 0.0064, "grad_norm": 0.039497025310993195, "learning_rate": 1.5e-06, "loss": -0.0103, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.037563733756542206, "mask/share_reasoning": 0.8333431482315063, "mask/share_step_conf": 0.12518689036369324, "num_tokens": 1386241.0, "reward": 1.2087037563323975, "reward_std": 0.20760923624038696, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6677120923995972, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7162538766860962, "step": 6 }, { "adv/mean_abs_final_conf": 0.7652530670166016, "adv/mean_abs_reasoning": 0.4704771637916565, "adv/mean_abs_step_conf": 0.7372301816940308, "adv/ratio_final_to_reasoning": 1.6265466762494811, "adv/ratio_step_to_reasoning": 1.566983986539465, "adv/std_final_conf": 0.9297286868095398, "adv/std_reasoning": 0.7392281293869019, "adv/std_step_conf": 0.9354885816574097, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.46026745531696034, "calib/avg_num_step_conf": 5.26953125, "calib/ece": 0.27733333333333327, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.30980392156862746, "calib/gap": -0.005463546354635529, "calib/mean_conf": 0.8812549019607842, "calib/mu_c": 0.879090909090909, "calib/mu_w": 0.8845544554455446, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.27733333333333327, "calib/std_conf": 0.045770031781768666, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7868014705882352, "calib/step_q_c_n": 816.0, "calib/step_q_gap": -0.001997778942721573, "calib/step_q_w": 0.7887992495309568, "calib/step_q_w_n": 533.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1202.0, "completions/max_terminated_length": 1202.0, "completions/mean_length": 524.5234375, "completions/mean_terminated_length": 526.5804443359375, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.007466666666666667, "grad_norm": 0.034977804869413376, "learning_rate": 1.75e-06, "loss": 0.0616, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03061247244477272, "mask/share_reasoning": 0.8583376407623291, "mask/share_step_conf": 0.10714363306760788, "num_tokens": 1627943.0, "reward": 1.2274894714355469, "reward_std": 0.25535452365875244, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6677761673927307, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7350075840950012, "step": 7 }, { "adv/mean_abs_final_conf": 0.7622976899147034, "adv/mean_abs_reasoning": 0.4128996729850769, "adv/mean_abs_step_conf": 0.7528561949729919, "adv/ratio_final_to_reasoning": 1.8462056034184713, "adv/ratio_step_to_reasoning": 1.823339286103532, "adv/std_final_conf": 0.9313320517539978, "adv/std_reasoning": 0.7013623118400574, "adv/std_step_conf": 0.9357892274856567, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5052173913043478, "calib/avg_num_step_conf": 5.0546875, "calib/ece": 0.3403200000000002, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.312, "calib/gap": 0.0033301127214171444, "calib/mean_conf": 0.88032, "calib/mu_c": 0.8818518518518519, "calib/mu_w": 0.8785217391304347, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3403200000000002, "calib/std_conf": 0.0450144154688251, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7924, "calib/step_q_c_n": 625.0, "calib/step_q_gap": 0.05108460388639757, "calib/step_q_w": 0.7413153961136024, "calib/step_q_w_n": 669.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2684.0, "completions/max_terminated_length": 2684.0, "completions/mean_length": 535.90234375, "completions/mean_terminated_length": 538.0039672851562, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.008533333333333334, "grad_norm": 0.03353934735059738, "learning_rate": 2.0000000000000003e-06, "loss": -0.0109, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03260628134012222, "mask/share_reasoning": 0.856112003326416, "mask/share_step_conf": 0.10737546533346176, "num_tokens": 1871646.0, "reward": 1.1858603954315186, "reward_std": 0.24610640108585358, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6166367530822754, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.727541983127594, "step": 8 }, { "adv/mean_abs_final_conf": 0.8015788793563843, "adv/mean_abs_reasoning": 0.42125481367111206, "adv/mean_abs_step_conf": 0.7736630439758301, "adv/ratio_final_to_reasoning": 1.902836129920652, "adv/ratio_step_to_reasoning": 1.836567841762053, "adv/std_final_conf": 0.9309937357902527, "adv/std_reasoning": 0.68181312084198, "adv/std_step_conf": 0.9360260963439941, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.4510899558498896, "calib/avg_num_step_conf": 4.765625, "calib/ece": 0.26352226720647776, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.2550607287449393, "calib/gap": -0.007388245033112639, "calib/mean_conf": 0.8748582995951416, "calib/mu_c": 0.8719867549668875, "calib/mu_w": 0.8793750000000001, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.26352226720647776, "calib/std_conf": 0.04740086781034078, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.7605013550135501, "calib/step_q_c_n": 738.0, "calib/step_q_gap": 0.0124930562583635, "calib/step_q_w": 0.7480082987551866, "calib/step_q_w_n": 482.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2593.0, "completions/max_terminated_length": 2593.0, "completions/mean_length": 491.46875, "completions/mean_terminated_length": 499.2698669433594, "completions/min_length": 0.0, "completions/min_terminated_length": 194.0, "epoch": 0.0096, "grad_norm": 0.05028747767210007, "learning_rate": 2.25e-06, "loss": -0.0408, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.033226363360881805, "mask/share_reasoning": 0.8455919027328491, "mask/share_step_conf": 0.10555671155452728, "num_tokens": 2104998.0, "reward": 1.1680667400360107, "reward_std": 0.28994813561439514, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6573429703712463, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.6847077012062073, "step": 9 }, { "adv/mean_abs_final_conf": 0.7939379215240479, "adv/mean_abs_reasoning": 0.4176982045173645, "adv/mean_abs_step_conf": 0.7548781633377075, "adv/ratio_final_to_reasoning": 1.9007453537929737, "adv/ratio_step_to_reasoning": 1.8072334407325081, "adv/std_final_conf": 0.9302391409873962, "adv/std_reasoning": 0.6817051768302917, "adv/std_step_conf": 0.9354656934738159, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.48880249867091974, "calib/avg_num_step_conf": 4.8828125, "calib/ece": 0.2816733067729084, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.35856573705179284, "calib/gap": 0.0021325093035619025, "calib/mean_conf": 0.8872509960159362, "calib/mu_c": 0.888092105263158, "calib/mu_w": 0.8859595959595961, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2816733067729084, "calib/std_conf": 0.04676819105147297, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7871965317919075, "calib/step_q_c_n": 692.0, "calib/step_q_gap": 0.0032180371682516418, "calib/step_q_w": 0.7839784946236559, "calib/step_q_w_n": 558.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1937.0, "completions/max_terminated_length": 1937.0, "completions/mean_length": 511.09375, "completions/mean_terminated_length": 513.0980834960938, "completions/min_length": 0.0, "completions/min_terminated_length": 185.0, "epoch": 0.010666666666666666, "grad_norm": 0.03779426962137222, "learning_rate": 2.5e-06, "loss": 0.0688, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03259393572807312, "mask/share_reasoning": 0.8553390502929688, "mask/share_step_conf": 0.10816076397895813, "num_tokens": 2342638.0, "reward": 1.2232654094696045, "reward_std": 0.24356569349765778, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6673445105552673, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7321712970733643, "step": 10 }, { "adv/mean_abs_final_conf": 0.7801154851913452, "adv/mean_abs_reasoning": 0.3890037536621094, "adv/mean_abs_step_conf": 0.7646522521972656, "adv/ratio_final_to_reasoning": 2.0054189139494976, "adv/ratio_step_to_reasoning": 1.9656680558960529, "adv/std_final_conf": 0.9274177551269531, "adv/std_reasoning": 0.6612036824226379, "adv/std_step_conf": 0.9355158805847168, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.43217879256965946, "calib/avg_num_step_conf": 5.6171875, "calib/ece": 0.29846456692913387, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.35826771653543305, "calib/gap": -0.02004127966976288, "calib/mean_conf": 0.8767322834645669, "calib/mu_c": 0.8686842105263157, "calib/mu_w": 0.8887254901960786, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.28838582677165353, "calib/std_conf": 0.09503807349936566, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7773716381418093, "calib/step_q_c_n": 818.0, "calib/step_q_gap": 0.01146841233535767, "calib/step_q_w": 0.7659032258064516, "calib/step_q_w_n": 620.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2896.0, "completions/max_terminated_length": 2896.0, "completions/mean_length": 535.8984375, "completions/mean_terminated_length": 535.8984375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.011733333333333333, "grad_norm": 0.039491284638643265, "learning_rate": 2.7500000000000004e-06, "loss": 0.054, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03238622844219208, "mask/share_reasoning": 0.8498245477676392, "mask/share_step_conf": 0.11778921633958817, "num_tokens": 2584308.0, "reward": 1.2056663036346436, "reward_std": 0.2091943919658661, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6545777320861816, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.720174252986908, "step": 11 }, { "adv/mean_abs_final_conf": 0.7940953969955444, "adv/mean_abs_reasoning": 0.44334566593170166, "adv/mean_abs_step_conf": 0.7410632371902466, "adv/ratio_final_to_reasoning": 1.7911427989867312, "adv/ratio_step_to_reasoning": 1.6715247134149473, "adv/std_final_conf": 0.9285423159599304, "adv/std_reasoning": 0.7014912366867065, "adv/std_step_conf": 0.9357432126998901, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4787249031349067, "calib/avg_num_step_conf": 5.58984375, "calib/ece": 0.2275793650793651, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.39285714285714285, "calib/gap": -0.005296935540683334, "calib/mean_conf": 0.8849603174603174, "calib/mu_c": 0.8831736526946108, "calib/mu_w": 0.8884705882352941, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.22492063492063497, "calib/std_conf": 0.05906344460652552, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7653591790193843, "calib/step_q_c_n": 877.0, "calib/step_q_gap": -0.004514467190001925, "calib/step_q_w": 0.7698736462093863, "calib/step_q_w_n": 554.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2063.0, "completions/max_terminated_length": 2063.0, "completions/mean_length": 480.0546875, "completions/mean_terminated_length": 481.9372863769531, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.0128, "grad_norm": 0.0577828474342823, "learning_rate": 3e-06, "loss": 0.0138, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.035471174865961075, "mask/share_reasoning": 0.831091046333313, "mask/share_step_conf": 0.12953157722949982, "num_tokens": 2811378.0, "reward": 1.2840325832366943, "reward_std": 0.2465120553970337, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7054234743118286, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7684301137924194, "step": 12 }, { "adv/mean_abs_final_conf": 0.7640044689178467, "adv/mean_abs_reasoning": 0.4305124580860138, "adv/mean_abs_step_conf": 0.7631770968437195, "adv/ratio_final_to_reasoning": 1.7746396290469326, "adv/ratio_step_to_reasoning": 1.7727177983110567, "adv/std_final_conf": 0.930860698223114, "adv/std_reasoning": 0.7013241648674011, "adv/std_step_conf": 0.9360091090202332, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5543396226415095, "calib/avg_num_step_conf": 4.91015625, "calib/ece": 0.2992578125, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.4140625, "calib/gap": 0.011764779874213649, "calib/mean_conf": 0.8851953125, "calib/mu_c": 0.8900666666666666, "calib/mu_w": 0.8783018867924529, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2992578125, "calib/std_conf": 0.05742489097096609, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7743633762517882, "calib/step_q_c_n": 699.0, "calib/step_q_gap": 0.018467318904118013, "calib/step_q_w": 0.7558960573476702, "calib/step_q_w_n": 558.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1411.0, "completions/max_terminated_length": 1411.0, "completions/mean_length": 468.6484375, "completions/mean_terminated_length": 470.4862976074219, "completions/min_length": 0.0, "completions/min_terminated_length": 85.0, "epoch": 0.013866666666666666, "grad_norm": 0.04164907708764076, "learning_rate": 3.2500000000000002e-06, "loss": 0.0422, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.0350169837474823, "mask/share_reasoning": 0.8462280035018921, "mask/share_step_conf": 0.11484874039888382, "num_tokens": 3035944.0, "reward": 1.262359619140625, "reward_std": 0.22623397409915924, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6652238368988037, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.771935224533081, "step": 13 }, { "adv/mean_abs_final_conf": 0.7712956070899963, "adv/mean_abs_reasoning": 0.47973084449768066, "adv/mean_abs_step_conf": 0.7793824672698975, "adv/ratio_final_to_reasoning": 1.6077673886022672, "adv/ratio_step_to_reasoning": 1.624624466425497, "adv/std_final_conf": 0.9276650547981262, "adv/std_reasoning": 0.7205986380577087, "adv/std_step_conf": 0.9358716011047363, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4078828828828829, "calib/avg_num_step_conf": 5.36328125, "calib/ece": 0.3528286852589641, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6414342629482072, "calib/gap": -0.010880952380952658, "calib/mean_conf": 0.910597609561753, "calib/mu_c": 0.9057857142857141, "calib/mu_w": 0.9166666666666667, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3528286852589641, "calib/std_conf": 0.045085591843685116, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7475923392612859, "calib/step_q_c_n": 731.0, "calib/step_q_gap": 0.03533377228309276, "calib/step_q_w": 0.7122585669781931, "calib/step_q_w_n": 642.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2698.0, "completions/max_terminated_length": 2698.0, "completions/mean_length": 536.953125, "completions/mean_terminated_length": 541.1810913085938, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.014933333333333333, "grad_norm": 0.03964659944176674, "learning_rate": 3.5e-06, "loss": 0.0449, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.0325857549905777, "mask/share_reasoning": 0.8444947004318237, "mask/share_step_conf": 0.11510701477527618, "num_tokens": 3278804.0, "reward": 1.217205286026001, "reward_std": 0.23579458892345428, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6093109250068665, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7598154544830322, "step": 14 }, { "adv/mean_abs_final_conf": 0.7839799523353577, "adv/mean_abs_reasoning": 0.40933963656425476, "adv/mean_abs_step_conf": 0.7739418745040894, "adv/ratio_final_to_reasoning": 1.9152309776683327, "adv/ratio_step_to_reasoning": 1.8907083638420203, "adv/std_final_conf": 0.9218094348907471, "adv/std_reasoning": 0.6613172888755798, "adv/std_step_conf": 0.9354211091995239, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.487977369165488, "calib/avg_num_step_conf": 5.10546875, "calib/ece": 0.3059215686274509, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.6549019607843137, "calib/gap": 0.009904847627619895, "calib/mean_conf": 0.909843137254902, "calib/mu_c": 0.9137662337662338, "calib/mu_w": 0.9038613861386139, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3059215686274509, "calib/std_conf": 0.06864497255316152, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7073884514435695, "calib/step_q_c_n": 762.0, "calib/step_q_gap": -0.014597603602302112, "calib/step_q_w": 0.7219860550458717, "calib/step_q_w_n": 545.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1383.0, "completions/max_terminated_length": 1383.0, "completions/mean_length": 468.61328125, "completions/mean_terminated_length": 470.4510192871094, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.016, "grad_norm": 0.042239848524332047, "learning_rate": 3.7500000000000005e-06, "loss": 0.0464, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.033957935869693756, "mask/share_reasoning": 0.84504234790802, "mask/share_step_conf": 0.11709348857402802, "num_tokens": 3506649.0, "reward": 1.2645666599273682, "reward_std": 0.22863566875457764, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6646316647529602, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7724852561950684, "step": 15 }, { "adv/mean_abs_final_conf": 0.7644206285476685, "adv/mean_abs_reasoning": 0.44867604970932007, "adv/mean_abs_step_conf": 0.7611935138702393, "adv/ratio_final_to_reasoning": 1.7037250574059104, "adv/ratio_step_to_reasoning": 1.696532530237322, "adv/std_final_conf": 0.9242152571678162, "adv/std_reasoning": 0.7205649018287659, "adv/std_step_conf": 0.9357426166534424, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5098833218943033, "calib/avg_num_step_conf": 6.55859375, "calib/ece": 0.30337349397590363, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7991967871485943, "calib/gap": 0.01745161290322539, "calib/mean_conf": 0.925863453815261, "calib/mu_c": 0.9324516129032256, "calib/mu_w": 0.9150000000000003, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30337349397590363, "calib/std_conf": 0.07098388745929748, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6563173359451517, "calib/step_q_c_n": 1021.0, "calib/step_q_gap": 0.024645603422355444, "calib/step_q_w": 0.6316717325227963, "calib/step_q_w_n": 658.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2619.0, "completions/max_terminated_length": 2619.0, "completions/mean_length": 627.3984375, "completions/mean_terminated_length": 634.8379516601562, "completions/min_length": 0.0, "completions/min_terminated_length": 202.0, "epoch": 0.017066666666666667, "grad_norm": 0.037402741611003876, "learning_rate": 4.000000000000001e-06, "loss": 0.0108, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.02552606537938118, "mask/share_reasoning": 0.8506210446357727, "mask/share_step_conf": 0.11213415116071701, "num_tokens": 3776111.0, "reward": 1.262770414352417, "reward_std": 0.23914945125579834, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6576437950134277, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7761359214782715, "step": 16 }, { "adv/mean_abs_final_conf": 0.7625212073326111, "adv/mean_abs_reasoning": 0.3886851668357849, "adv/mean_abs_step_conf": 0.7757367491722107, "adv/ratio_final_to_reasoning": 1.9617965191215225, "adv/ratio_step_to_reasoning": 1.9957971524546259, "adv/std_final_conf": 0.9166886210441589, "adv/std_reasoning": 0.6612927913665771, "adv/std_step_conf": 0.9357386827468872, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5342518178339073, "calib/avg_num_step_conf": 5.74609375, "calib/ece": 0.1399604743083005, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8616600790513834, "calib/gap": 0.0019737849215459846, "calib/mean_conf": 0.9325296442687746, "calib/mu_c": 0.9329353233830845, "calib/mu_w": 0.9309615384615385, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.13901185770750998, "calib/std_conf": 0.03935151345673892, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6475511945392493, "calib/step_q_c_n": 1172.0, "calib/step_q_gap": 0.02213313433858033, "calib/step_q_w": 0.6254180602006689, "calib/step_q_w_n": 299.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2625.0, "completions/max_terminated_length": 2625.0, "completions/mean_length": 512.58203125, "completions/mean_terminated_length": 516.6181030273438, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.018133333333333335, "grad_norm": 0.06586568057537079, "learning_rate": 4.25e-06, "loss": -0.001, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03208363801240921, "mask/share_reasoning": 0.8363739848136902, "mask/share_step_conf": 0.1237298846244812, "num_tokens": 4010860.0, "reward": 1.4073078632354736, "reward_std": 0.2265596091747284, "rewards/accuracy_reward_step": 0.78515625, "rewards/final_brier_reward_step": 0.7993745803833008, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8310579061508179, "step": 17 }, { "adv/mean_abs_final_conf": 0.7089370489120483, "adv/mean_abs_reasoning": 0.3773331940174103, "adv/mean_abs_step_conf": 0.7654873728752136, "adv/ratio_final_to_reasoning": 1.8788091271910146, "adv/ratio_step_to_reasoning": 2.028677532249903, "adv/std_final_conf": 0.9150155782699585, "adv/std_reasoning": 0.6815622448921204, "adv/std_step_conf": 0.9358413219451904, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5111811546118116, "calib/avg_num_step_conf": 5.93359375, "calib/ece": 0.38906882591093117, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.8866396761133604, "calib/gap": 0.004445919044459012, "calib/mean_conf": 0.9406477732793521, "calib/mu_c": 0.9426277372262772, "calib/mu_w": 0.9381818181818182, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.3875303643724696, "calib/std_conf": 0.04027719521709381, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.621609947643979, "calib/step_q_c_n": 764.0, "calib/step_q_gap": 0.017954318504906253, "calib/step_q_w": 0.6036556291390728, "calib/step_q_w_n": 755.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2898.0, "completions/max_terminated_length": 2898.0, "completions/mean_length": 557.921875, "completions/mean_terminated_length": 560.10986328125, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.0192, "grad_norm": 0.03878249600529671, "learning_rate": 4.5e-06, "loss": -0.0734, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03142430633306503, "mask/share_reasoning": 0.8538853526115417, "mask/share_step_conf": 0.11078407615423203, "num_tokens": 4264408.0, "reward": 1.2024595737457275, "reward_std": 0.22675225138664246, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.583086371421814, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7613070607185364, "step": 18 }, { "adv/mean_abs_final_conf": 0.6971423625946045, "adv/mean_abs_reasoning": 0.3849453032016754, "adv/mean_abs_step_conf": 0.760978102684021, "adv/ratio_final_to_reasoning": 1.8110166737879823, "adv/ratio_step_to_reasoning": 1.9768473504022452, "adv/std_final_conf": 0.9060702919960022, "adv/std_reasoning": 0.7011492848396301, "adv/std_step_conf": 0.9357239603996277, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5756445701793866, "calib/avg_num_step_conf": 5.19140625, "calib/ece": 0.3559448818897637, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.952755905511811, "calib/gap": 0.011833729827043005, "calib/mean_conf": 0.9504330708661417, "calib/mu_c": 0.9552317880794701, "calib/mu_w": 0.9433980582524271, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3559448818897637, "calib/std_conf": 0.03301469263736767, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6374509803921569, "calib/step_q_c_n": 765.0, "calib/step_q_gap": 0.023266583229036297, "calib/step_q_w": 0.6141843971631206, "calib/step_q_w_n": 564.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2597.0, "completions/max_terminated_length": 2597.0, "completions/mean_length": 518.5390625, "completions/mean_terminated_length": 518.5390625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.020266666666666665, "grad_norm": 0.03828023374080658, "learning_rate": 4.75e-06, "loss": 0.0769, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.030845241621136665, "mask/share_reasoning": 0.857416033744812, "mask/share_step_conf": 0.11173869669437408, "num_tokens": 4501914.0, "reward": 1.2848021984100342, "reward_std": 0.20347543060779572, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6318714618682861, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8106632232666016, "step": 19 }, { "adv/mean_abs_final_conf": 0.6998113393783569, "adv/mean_abs_reasoning": 0.44451165199279785, "adv/mean_abs_step_conf": 0.772807240486145, "adv/ratio_final_to_reasoning": 1.5743374470410858, "adv/ratio_step_to_reasoning": 1.738553392293677, "adv/std_final_conf": 0.8987686634063721, "adv/std_reasoning": 0.739271342754364, "adv/std_step_conf": 0.9359571933746338, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.45597714133829487, "calib/avg_num_step_conf": 5.5234375, "calib/ece": 0.3706400000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.996, "calib/gap": -0.003477971958269599, "calib/mean_conf": 0.96664, "calib/mu_c": 0.965234899328859, "calib/mu_w": 0.9687128712871286, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3706400000000001, "calib/std_conf": 0.018652356419498316, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5813636363636364, "calib/step_q_c_n": 814.0, "calib/step_q_gap": -0.010903030303030303, "calib/step_q_w": 0.5922666666666667, "calib/step_q_w_n": 600.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2115.0, "completions/max_terminated_length": 2115.0, "completions/mean_length": 487.625, "completions/mean_terminated_length": 487.625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.021333333333333333, "grad_norm": 0.03156009316444397, "learning_rate": 5e-06, "loss": 0.0208, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.035062696784734726, "mask/share_reasoning": 0.8308588266372681, "mask/share_step_conf": 0.13407844305038452, "num_tokens": 4731618.0, "reward": 1.2386574745178223, "reward_std": 0.26106804609298706, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6013144850730896, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7825314998626709, "step": 20 }, { "adv/mean_abs_final_conf": 0.7398608922958374, "adv/mean_abs_reasoning": 0.48953911662101746, "adv/mean_abs_step_conf": 0.7502537965774536, "adv/ratio_final_to_reasoning": 1.5113417236249367, "adv/ratio_step_to_reasoning": 1.5325717008193067, "adv/std_final_conf": 0.8951833844184875, "adv/std_reasoning": 0.7393571138381958, "adv/std_step_conf": 0.935997486114502, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5368434472208058, "calib/avg_num_step_conf": 5.93359375, "calib/ece": 0.39059055118110253, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9921259842519685, "calib/gap": 0.0013985211626722505, "calib/mean_conf": 0.9732677165354332, "calib/mu_c": 0.9738513513513514, "calib/mu_w": 0.9724528301886791, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39059055118110253, "calib/std_conf": 0.016288225213710716, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5789975845410629, "calib/step_q_c_n": 828.0, "calib/step_q_gap": -0.0029416339828155502, "calib/step_q_w": 0.5819392185238784, "calib/step_q_w_n": 691.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2458.0, "completions/max_terminated_length": 2458.0, "completions/mean_length": 516.46484375, "completions/mean_terminated_length": 516.46484375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.0224, "grad_norm": 0.024165457114577293, "learning_rate": 4.9722222222222224e-06, "loss": -0.0086, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03220517560839653, "mask/share_reasoning": 0.8394327163696289, "mask/share_step_conf": 0.12836214900016785, "num_tokens": 4966793.0, "reward": 1.256303310394287, "reward_std": 0.25476592779159546, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.5999652147293091, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7992894053459167, "step": 21 }, { "adv/mean_abs_final_conf": 0.6778824925422668, "adv/mean_abs_reasoning": 0.4465583562850952, "adv/mean_abs_step_conf": 0.7399794459342957, "adv/ratio_final_to_reasoning": 1.5180154687542962, "adv/ratio_step_to_reasoning": 1.6570722180414699, "adv/std_final_conf": 0.8620956540107727, "adv/std_reasoning": 0.7391456365585327, "adv/std_step_conf": 0.9359897375106812, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.463719512195122, "calib/avg_num_step_conf": 5.83203125, "calib/ece": 0.3327952755905512, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0016897018970188915, "calib/mean_conf": 0.9784645669291339, "calib/mu_c": 0.9778658536585366, "calib/mu_w": 0.9795555555555555, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3327952755905512, "calib/std_conf": 0.012813191575706534, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5943560209424084, "calib/step_q_c_n": 955.0, "calib/step_q_gap": 0.011902489343895373, "calib/step_q_w": 0.582453531598513, "calib/step_q_w_n": 538.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2354.0, "completions/max_terminated_length": 2354.0, "completions/mean_length": 504.71875, "completions/mean_terminated_length": 504.71875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.023466666666666667, "grad_norm": 0.03978949412703514, "learning_rate": 4.944444444444445e-06, "loss": 0.0607, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03203662857413292, "mask/share_reasoning": 0.8401437997817993, "mask/share_step_conf": 0.12781958281993866, "num_tokens": 5197817.0, "reward": 1.3022887706756592, "reward_std": 0.2456250935792923, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6543769836425781, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8118190765380859, "step": 22 }, { "adv/mean_abs_final_conf": 0.7471860647201538, "adv/mean_abs_reasoning": 0.5141867995262146, "adv/mean_abs_step_conf": 0.7579343318939209, "adv/ratio_final_to_reasoning": 1.4531412813565632, "adv/ratio_step_to_reasoning": 1.4740447101954033, "adv/std_final_conf": 0.8914094567298889, "adv/std_reasoning": 0.7576435804367065, "adv/std_step_conf": 0.9362046122550964, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6178689148073022, "calib/avg_num_step_conf": 5.9375, "calib/ece": 0.4417460317460319, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.006001521298174661, "calib/mean_conf": 0.9814285714285715, "calib/mu_c": 0.9841911764705884, "calib/mu_w": 0.9781896551724137, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4417460317460319, "calib/std_conf": 0.01289236797207704, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5900246913580247, "calib/step_q_c_n": 810.0, "calib/step_q_gap": -0.005792210050425961, "calib/step_q_w": 0.5958169014084507, "calib/step_q_w_n": 710.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2592.0, "completions/max_terminated_length": 2592.0, "completions/mean_length": 545.265625, "completions/mean_terminated_length": 551.7312622070312, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.024533333333333334, "grad_norm": 0.03615272417664528, "learning_rate": 4.9166666666666665e-06, "loss": -0.0227, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.032736070454120636, "mask/share_reasoning": 0.8280990719795227, "mask/share_step_conf": 0.12744614481925964, "num_tokens": 5441341.0, "reward": 1.1947392225265503, "reward_std": 0.2796531021595001, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.5466066598892212, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7702640295028687, "step": 23 }, { "adv/mean_abs_final_conf": 0.6879265904426575, "adv/mean_abs_reasoning": 0.5578153729438782, "adv/mean_abs_step_conf": 0.770302414894104, "adv/ratio_final_to_reasoning": 1.23325140146661, "adv/ratio_step_to_reasoning": 1.3809271889170471, "adv/std_final_conf": 0.8792401552200317, "adv/std_reasoning": 0.8098664283752441, "adv/std_step_conf": 0.9363515377044678, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.4586881868131868, "calib/avg_num_step_conf": 6.328125, "calib/ece": 0.4464049586776861, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.9958677685950413, "calib/gap": 4.3956043956017155e-05, "calib/mean_conf": 0.9835950413223141, "calib/mu_c": 0.9836153846153847, "calib/mu_w": 0.9835714285714287, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.4464049586776861, "calib/std_conf": 0.01568969684161436, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5907673267326732, "calib/step_q_c_n": 808.0, "calib/step_q_gap": 0.013021021313953995, "calib/step_q_w": 0.5777463054187192, "calib/step_q_w_n": 812.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2792.0, "completions/max_terminated_length": 2792.0, "completions/mean_length": 605.82421875, "completions/mean_terminated_length": 615.4404907226562, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.0256, "grad_norm": 0.023386143147945404, "learning_rate": 4.888888888888889e-06, "loss": -0.0244, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.029682423919439316, "mask/share_reasoning": 0.8312495946884155, "mask/share_step_conf": 0.12344293296337128, "num_tokens": 5700944.0, "reward": 1.157957911491394, "reward_std": 0.3184521794319153, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.521700382232666, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.75179523229599, "step": 24 }, { "adv/mean_abs_final_conf": 0.6561360359191895, "adv/mean_abs_reasoning": 0.39023175835609436, "adv/mean_abs_step_conf": 0.7360990047454834, "adv/ratio_final_to_reasoning": 1.681400915915337, "adv/ratio_step_to_reasoning": 1.8863124002167404, "adv/std_final_conf": 0.8390585780143738, "adv/std_reasoning": 0.6613587737083435, "adv/std_step_conf": 0.9361077547073364, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5486625845955526, "calib/avg_num_step_conf": 6.02734375, "calib/ece": 0.41007936507936527, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9920634920634921, "calib/gap": 0.005144698678698156, "calib/mean_conf": 0.9775396825396826, "calib/mu_c": 0.9797241379310345, "calib/mu_w": 0.9745794392523364, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4061111111111113, "calib/std_conf": 0.08808983967443393, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.597260606060606, "calib/step_q_c_n": 825.0, "calib/step_q_gap": 0.03227453363720778, "calib/step_q_w": 0.5649860724233983, "calib/step_q_w_n": 718.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2682.0, "completions/max_terminated_length": 2682.0, "completions/mean_length": 532.88671875, "completions/mean_terminated_length": 532.88671875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.02666666666666667, "grad_norm": 0.02353591099381447, "learning_rate": 4.861111111111111e-06, "loss": 0.061, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.030907901003956795, "mask/share_reasoning": 0.8425581455230713, "mask/share_step_conf": 0.12653397023677826, "num_tokens": 5940587.0, "reward": 1.2288556098937988, "reward_std": 0.25345277786254883, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.5795210599899292, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7840168476104736, "step": 25 }, { "adv/mean_abs_final_conf": 0.6244549751281738, "adv/mean_abs_reasoning": 0.38389983773231506, "adv/mean_abs_step_conf": 0.7507979869842529, "adv/ratio_final_to_reasoning": 1.6266091145461556, "adv/ratio_step_to_reasoning": 1.9557132178518082, "adv/std_final_conf": 0.8316320776939392, "adv/std_reasoning": 0.6815283298492432, "adv/std_step_conf": 0.935912013053894, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4784319045035269, "calib/avg_num_step_conf": 5.43359375, "calib/ece": 0.37377510040160655, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0006918068366793539, "calib/mean_conf": 0.9842168674698797, "calib/mu_c": 0.9839473684210527, "calib/mu_w": 0.984639175257732, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.37377510040160655, "calib/std_conf": 0.010805377427554943, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6245371577574967, "calib/step_q_c_n": 767.0, "calib/step_q_gap": 0.040594850065189036, "calib/step_q_w": 0.5839423076923077, "calib/step_q_w_n": 624.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2556.0, "completions/max_terminated_length": 2556.0, "completions/mean_length": 546.08984375, "completions/mean_terminated_length": 550.3897705078125, "completions/min_length": 0.0, "completions/min_terminated_length": 263.0, "epoch": 0.027733333333333332, "grad_norm": 0.04103153944015503, "learning_rate": 4.833333333333333e-06, "loss": 0.0347, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.02897307462990284, "mask/share_reasoning": 0.853653073310852, "mask/share_step_conf": 0.10956136882305145, "num_tokens": 6185626.0, "reward": 1.2445653676986694, "reward_std": 0.2300690859556198, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6050347089767456, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.785407304763794, "step": 26 }, { "adv/mean_abs_final_conf": 0.5905928611755371, "adv/mean_abs_reasoning": 0.44203656911849976, "adv/mean_abs_step_conf": 0.7786446809768677, "adv/ratio_final_to_reasoning": 1.3360724031346214, "adv/ratio_step_to_reasoning": 1.7614938115405812, "adv/std_final_conf": 0.8065117597579956, "adv/std_reasoning": 0.7014051079750061, "adv/std_step_conf": 0.9358586668968201, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4908809794269848, "calib/avg_num_step_conf": 6.02734375, "calib/ece": 0.4384584980237154, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00021267196768914864, "calib/mean_conf": 0.9878656126482214, "calib/mu_c": 0.9877697841726619, "calib/mu_w": 0.9879824561403511, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4384584980237154, "calib/std_conf": 0.007290534995045996, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5710660486674393, "calib/step_q_c_n": 863.0, "calib/step_q_gap": 0.003007225138027536, "calib/step_q_w": 0.5680588235294117, "calib/step_q_w_n": 680.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2408.0, "completions/max_terminated_length": 2408.0, "completions/mean_length": 497.96875, "completions/mean_terminated_length": 499.9216003417969, "completions/min_length": 0.0, "completions/min_terminated_length": 197.0, "epoch": 0.0288, "grad_norm": 0.02568461187183857, "learning_rate": 4.805555555555556e-06, "loss": -0.0224, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.031828589737415314, "mask/share_reasoning": 0.8362923860549927, "mask/share_step_conf": 0.12797272205352783, "num_tokens": 6418322.0, "reward": 1.209911823272705, "reward_std": 0.24842794239521027, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5534738302230835, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7800499200820923, "step": 27 }, { "adv/mean_abs_final_conf": 0.5335990786552429, "adv/mean_abs_reasoning": 0.4032040238380432, "adv/mean_abs_step_conf": 0.7723286151885986, "adv/ratio_final_to_reasoning": 1.3233972061488555, "adv/ratio_step_to_reasoning": 1.9154784415019215, "adv/std_final_conf": 0.7860013842582703, "adv/std_reasoning": 0.701285183429718, "adv/std_step_conf": 0.9357082843780518, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5305719921104537, "calib/avg_num_step_conf": 5.26171875, "calib/ece": 0.35595141700404864, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.001831501831501714, "calib/mean_conf": 0.9875303643724697, "calib/mu_c": 0.988205128205128, "calib/mu_w": 0.9863736263736262, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.35595141700404864, "calib/std_conf": 0.007739044043112878, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6041807228915663, "calib/step_q_c_n": 830.0, "calib/step_q_gap": 0.026114958868355465, "calib/step_q_w": 0.5780657640232109, "calib/step_q_w_n": 517.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2583.0, "completions/max_terminated_length": 2583.0, "completions/mean_length": 580.2421875, "completions/mean_terminated_length": 584.81103515625, "completions/min_length": 0.0, "completions/min_terminated_length": 201.0, "epoch": 0.029866666666666666, "grad_norm": 0.03119593672454357, "learning_rate": 4.777777777777778e-06, "loss": -0.0502, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.028895732015371323, "mask/share_reasoning": 0.8566277027130127, "mask/share_step_conf": 0.10666404664516449, "num_tokens": 6673808.0, "reward": 1.2516157627105713, "reward_std": 0.24505168199539185, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6188547015190125, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7847665548324585, "step": 28 }, { "adv/mean_abs_final_conf": 0.5490765571594238, "adv/mean_abs_reasoning": 0.49996668100357056, "adv/mean_abs_step_conf": 0.747068464756012, "adv/ratio_final_to_reasoning": 1.0982262979150457, "adv/ratio_step_to_reasoning": 1.4942365024334026, "adv/std_final_conf": 0.7831160426139832, "adv/std_reasoning": 0.7393505573272705, "adv/std_step_conf": 0.935847818851471, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5112689393939395, "calib/avg_num_step_conf": 5.92578125, "calib/ece": 0.46464285714285714, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0010227272727271197, "calib/mean_conf": 0.988452380952381, "calib/mu_c": 0.9889393939393939, "calib/mu_w": 0.9879166666666668, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.46464285714285714, "calib/std_conf": 0.007318282328239851, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5883853459972863, "calib/step_q_c_n": 737.0, "calib/step_q_gap": 0.08215457676651705, "calib/step_q_w": 0.5062307692307693, "calib/step_q_w_n": 780.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2202.0, "completions/max_terminated_length": 2202.0, "completions/mean_length": 596.0078125, "completions/mean_terminated_length": 598.3451538085938, "completions/min_length": 0.0, "completions/min_terminated_length": 186.0, "epoch": 0.030933333333333334, "grad_norm": 0.027221517637372017, "learning_rate": 4.75e-06, "loss": -0.067, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.02765384130179882, "mask/share_reasoning": 0.8585842847824097, "mask/share_step_conf": 0.10985566675662994, "num_tokens": 6933514.0, "reward": 1.2175838947296143, "reward_std": 0.25506746768951416, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5267691016197205, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8041993379592896, "step": 29 }, { "adv/mean_abs_final_conf": 0.6431605815887451, "adv/mean_abs_reasoning": 0.5578954219818115, "adv/mean_abs_step_conf": 0.7405064702033997, "adv/ratio_final_to_reasoning": 1.1528335889619712, "adv/ratio_step_to_reasoning": 1.3273212882315812, "adv/std_final_conf": 0.8330176472663879, "adv/std_reasoning": 0.775515079498291, "adv/std_step_conf": 0.9361404776573181, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5194967345370727, "calib/avg_num_step_conf": 6.26171875, "calib/ece": 0.44262948207171326, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0014925086438725144, "calib/mean_conf": 0.9884462151394423, "calib/mu_c": 0.9891240875912408, "calib/mu_w": 0.9876315789473683, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.44262948207171326, "calib/std_conf": 0.006404064914397626, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5581136638452237, "calib/step_q_c_n": 827.0, "calib/step_q_gap": 0.01882242673182155, "calib/step_q_w": 0.5392912371134021, "calib/step_q_w_n": 776.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2570.0, "completions/max_terminated_length": 2570.0, "completions/mean_length": 637.87109375, "completions/mean_terminated_length": 640.37255859375, "completions/min_length": 0.0, "completions/min_terminated_length": 198.0, "epoch": 0.032, "grad_norm": 0.03011404164135456, "learning_rate": 4.722222222222222e-06, "loss": 0.0648, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.02600308507680893, "mask/share_reasoning": 0.8624289035797119, "mask/share_step_conf": 0.10766175389289856, "num_tokens": 7203793.0, "reward": 1.2001268863677979, "reward_std": 0.3211213946342468, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5420949459075928, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7775168418884277, "step": 30 }, { "adv/mean_abs_final_conf": 0.5998179912567139, "adv/mean_abs_reasoning": 0.46080482006073, "adv/mean_abs_step_conf": 0.7407785654067993, "adv/ratio_final_to_reasoning": 1.3016747333017549, "adv/ratio_step_to_reasoning": 1.607575557280784, "adv/std_final_conf": 0.8259286284446716, "adv/std_reasoning": 0.7392586469650269, "adv/std_step_conf": 0.9360550045967102, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5241233766233766, "calib/avg_num_step_conf": 6.3515625, "calib/ece": 0.5478800000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0011883116883115719, "calib/mean_conf": 0.9878800000000001, "calib/mu_c": 0.9885454545454544, "calib/mu_w": 0.9873571428571428, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.5478800000000001, "calib/std_conf": 0.007893389639438817, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5275705329153606, "calib/step_q_c_n": 638.0, "calib/step_q_gap": 0.02370413615422695, "calib/step_q_w": 0.5038663967611337, "calib/step_q_w_n": 988.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2476.0, "completions/max_terminated_length": 2476.0, "completions/mean_length": 652.5, "completions/mean_terminated_length": 652.5, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.03306666666666667, "grad_norm": 0.028559528291225433, "learning_rate": 4.694444444444445e-06, "loss": -0.0636, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.02662399224936962, "mask/share_reasoning": 0.862493634223938, "mask/share_step_conf": 0.11088240146636963, "num_tokens": 7476745.0, "reward": 1.1457479000091553, "reward_std": 0.25158950686454773, "rewards/accuracy_reward_step": 0.4296875, "rewards/final_brier_reward_step": 0.44331127405166626, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7834672927856445, "step": 31 }, { "adv/mean_abs_final_conf": 0.5406360626220703, "adv/mean_abs_reasoning": 0.5045670866966248, "adv/mean_abs_step_conf": 0.7292282581329346, "adv/ratio_final_to_reasoning": 1.0714849955069152, "adv/ratio_step_to_reasoning": 1.4452553037241394, "adv/std_final_conf": 0.8152623176574707, "adv/std_reasoning": 0.7926533222198486, "adv/std_step_conf": 0.936184287071228, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5180899608865711, "calib/avg_num_step_conf": 6.00390625, "calib/ece": 0.46423387096774205, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0010417209908737401, "calib/mean_conf": 0.9884274193548388, "calib/mu_c": 0.9889230769230769, "calib/mu_w": 0.9878813559322032, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.46423387096774205, "calib/std_conf": 0.005494386265730213, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5454992764109985, "calib/step_q_c_n": 691.0, "calib/step_q_gap": 0.07904537570177861, "calib/step_q_w": 0.4664539007092199, "calib/step_q_w_n": 846.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2560.0, "completions/max_terminated_length": 2560.0, "completions/mean_length": 589.296875, "completions/mean_terminated_length": 596.2846069335938, "completions/min_length": 0.0, "completions/min_terminated_length": 200.0, "epoch": 0.034133333333333335, "grad_norm": 0.026078954339027405, "learning_rate": 4.666666666666667e-06, "loss": -0.0358, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.027657821774482727, "mask/share_reasoning": 0.8504782915115356, "mask/share_step_conf": 0.11014510691165924, "num_tokens": 7734309.0, "reward": 1.2053842544555664, "reward_std": 0.2780751585960388, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5188254117965698, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7983152866363525, "step": 32 }, { "adv/mean_abs_final_conf": 0.5705288052558899, "adv/mean_abs_reasoning": 0.4791930019855499, "adv/mean_abs_step_conf": 0.7540302276611328, "adv/ratio_final_to_reasoning": 1.1906033746150038, "adv/ratio_step_to_reasoning": 1.5735418183003236, "adv/std_final_conf": 0.80317622423172, "adv/std_reasoning": 0.739323616027832, "adv/std_step_conf": 0.9357584714889526, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5451496388028895, "calib/avg_num_step_conf": 5.84765625, "calib/ece": 0.53152, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0016563467492261008, "calib/mean_conf": 0.98752, "calib/mu_c": 0.988421052631579, "calib/mu_w": 0.9867647058823529, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.53152, "calib/std_conf": 0.008015584819587408, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5033039647577092, "calib/step_q_c_n": 681.0, "calib/step_q_gap": 0.01259318044398372, "calib/step_q_w": 0.4907107843137255, "calib/step_q_w_n": 816.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2599.0, "completions/max_terminated_length": 2599.0, "completions/mean_length": 589.0625, "completions/mean_terminated_length": 593.7008056640625, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.0352, "grad_norm": 0.036052510142326355, "learning_rate": 4.638888888888889e-06, "loss": -0.0546, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.027194002643227577, "mask/share_reasoning": 0.8590854406356812, "mask/share_step_conf": 0.10590802878141403, "num_tokens": 7991981.0, "reward": 1.1576623916625977, "reward_std": 0.2627679109573364, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.4591601490974426, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7858948707580566, "step": 33 }, { "adv/mean_abs_final_conf": 0.5254691243171692, "adv/mean_abs_reasoning": 0.44810813665390015, "adv/mean_abs_step_conf": 0.7442312240600586, "adv/ratio_final_to_reasoning": 1.172639105018125, "adv/ratio_step_to_reasoning": 1.6608295256974355, "adv/std_final_conf": 0.7536927461624146, "adv/std_reasoning": 0.7013894319534302, "adv/std_step_conf": 0.935812771320343, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.47804033156006437, "calib/avg_num_step_conf": 6.46875, "calib/ece": 0.449843137254902, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.996078431372549, "calib/gap": 0.00027712482989006304, "calib/mean_conf": 0.9870980392156862, "calib/mu_c": 0.9872262773722628, "calib/mu_w": 0.9869491525423727, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.449843137254902, "calib/std_conf": 0.011384983060706762, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.44419434194341945, "calib/step_q_c_n": 813.0, "calib/step_q_gap": 0.033755433283870206, "calib/step_q_w": 0.41043890865954924, "calib/step_q_w_n": 843.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2204.0, "completions/max_terminated_length": 2204.0, "completions/mean_length": 520.625, "completions/mean_terminated_length": 522.6666870117188, "completions/min_length": 0.0, "completions/min_terminated_length": 190.0, "epoch": 0.03626666666666667, "grad_norm": 0.02863895334303379, "learning_rate": 4.611111111111112e-06, "loss": 0.0052, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.030690640211105347, "mask/share_reasoning": 0.8315275311470032, "mask/share_step_conf": 0.13387557864189148, "num_tokens": 8230373.0, "reward": 1.257469654083252, "reward_std": 0.1998538076877594, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5468925833702087, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8308983445167542, "step": 34 }, { "adv/mean_abs_final_conf": 0.7124243974685669, "adv/mean_abs_reasoning": 0.6191345453262329, "adv/mean_abs_step_conf": 0.7590646743774414, "adv/ratio_final_to_reasoning": 1.1506778338352577, "adv/ratio_step_to_reasoning": 1.2260092416220725, "adv/std_final_conf": 0.886207103729248, "adv/std_reasoning": 0.8266004323959351, "adv/std_step_conf": 0.9362123012542725, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5425914249684741, "calib/avg_num_step_conf": 5.33984375, "calib/ece": 0.4697619047619047, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0026607818411098494, "calib/mean_conf": 0.9856349206349206, "calib/mu_c": 0.986923076923077, "calib/mu_w": 0.9842622950819672, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4697619047619047, "calib/std_conf": 0.01076414033061597, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4666979655712051, "calib/step_q_c_n": 639.0, "calib/step_q_gap": 0.03360730623054581, "calib/step_q_w": 0.43309065934065927, "calib/step_q_w_n": 728.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2906.0, "completions/max_terminated_length": 2906.0, "completions/mean_length": 607.5390625, "completions/mean_terminated_length": 609.921630859375, "completions/min_length": 0.0, "completions/min_terminated_length": 226.0, "epoch": 0.037333333333333336, "grad_norm": 0.02912158891558647, "learning_rate": 4.583333333333333e-06, "loss": 0.0197, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.027022257447242737, "mask/share_reasoning": 0.8741253018379211, "mask/share_step_conf": 0.09494614601135254, "num_tokens": 8495159.0, "reward": 1.1994110345840454, "reward_std": 0.2870052456855774, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5224952697753906, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7889447212219238, "step": 35 }, { "adv/mean_abs_final_conf": 0.5671288967132568, "adv/mean_abs_reasoning": 0.31426718831062317, "adv/mean_abs_step_conf": 0.7686116695404053, "adv/ratio_final_to_reasoning": 1.8046074098983058, "adv/ratio_step_to_reasoning": 2.445726751405896, "adv/std_final_conf": 0.759645402431488, "adv/std_reasoning": 0.5961850881576538, "adv/std_step_conf": 0.9357486963272095, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5037545787545787, "calib/avg_num_step_conf": 5.73046875, "calib/ece": 0.20880478087649412, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0011272893772892312, "calib/mean_conf": 0.9856972111553786, "calib/mu_c": 0.9859487179487179, "calib/mu_w": 0.9848214285714286, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20880478087649412, "calib/std_conf": 0.01005026967345645, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4283938294010889, "calib/step_q_c_n": 1102.0, "calib/step_q_gap": 0.044585610223006666, "calib/step_q_w": 0.3838082191780822, "calib/step_q_w_n": 365.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2356.0, "completions/max_terminated_length": 2356.0, "completions/mean_length": 548.8671875, "completions/mean_terminated_length": 548.8671875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.0384, "grad_norm": 0.07427214831113815, "learning_rate": 4.555555555555556e-06, "loss": 0.0355, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03173261880874634, "mask/share_reasoning": 0.8426610827445984, "mask/share_step_conf": 0.12560628354549408, "num_tokens": 8738381.0, "reward": 1.3560553789138794, "reward_std": 0.18213136494159698, "rewards/accuracy_reward_step": 0.76171875, "rewards/final_brier_reward_step": 0.7680597305297852, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7978066205978394, "step": 36 }, { "adv/mean_abs_final_conf": 0.5322921276092529, "adv/mean_abs_reasoning": 0.4058647155761719, "adv/mean_abs_step_conf": 0.7570114135742188, "adv/ratio_final_to_reasoning": 1.3115013628459984, "adv/ratio_step_to_reasoning": 1.8651816344753043, "adv/std_final_conf": 0.7714137434959412, "adv/std_reasoning": 0.6817934513092041, "adv/std_step_conf": 0.9359712600708008, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5140953528860318, "calib/avg_num_step_conf": 6.01953125, "calib/ece": 0.5228163265306123, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9959183673469387, "calib/gap": -0.0005852417302797397, "calib/mean_conf": 0.9869795918367347, "calib/mu_c": 0.9866666666666667, "calib/mu_w": 0.9872519083969464, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.5222448979591837, "calib/std_conf": 0.011059191017242254, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4620941558441558, "calib/step_q_c_n": 616.0, "calib/step_q_gap": 0.06104550719550711, "calib/step_q_w": 0.4010486486486487, "calib/step_q_w_n": 925.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2955.0, "completions/max_terminated_length": 2955.0, "completions/mean_length": 612.32421875, "completions/mean_terminated_length": 614.7255249023438, "completions/min_length": 0.0, "completions/min_terminated_length": 208.0, "epoch": 0.039466666666666664, "grad_norm": 0.03133146092295647, "learning_rate": 4.527777777777778e-06, "loss": 0.0536, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.028429534286260605, "mask/share_reasoning": 0.854699969291687, "mask/share_step_conf": 0.11296417564153671, "num_tokens": 9002232.0, "reward": 1.170437216758728, "reward_std": 0.24616439640522003, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.4580800533294678, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8011627793312073, "step": 37 }, { "adv/mean_abs_final_conf": 0.5607985854148865, "adv/mean_abs_reasoning": 0.4902248978614807, "adv/mean_abs_step_conf": 0.7547855377197266, "adv/ratio_final_to_reasoning": 1.1439618588555396, "adv/ratio_step_to_reasoning": 1.5396719771116172, "adv/std_final_conf": 0.8122161030769348, "adv/std_reasoning": 0.7575083374977112, "adv/std_step_conf": 0.935595691204071, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5287289983326919, "calib/avg_num_step_conf": 5.52734375, "calib/ece": 0.4382071713147411, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": 0.0012055918943183475, "calib/mean_conf": 0.98800796812749, "calib/mu_c": 0.9885507246376812, "calib/mu_w": 0.9873451327433629, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4382071713147411, "calib/std_conf": 0.009060219970361357, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5019422572178478, "calib/step_q_c_n": 762.0, "calib/step_q_gap": 0.017577785548628877, "calib/step_q_w": 0.4843644716692189, "calib/step_q_w_n": 653.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3062.0, "completions/max_terminated_length": 3062.0, "completions/mean_length": 557.86328125, "completions/mean_terminated_length": 557.86328125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.04053333333333333, "grad_norm": 0.03880874812602997, "learning_rate": 4.5e-06, "loss": 0.0294, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.030267592519521713, "mask/share_reasoning": 0.8548325300216675, "mask/share_step_conf": 0.11489984393119812, "num_tokens": 9251933.0, "reward": 1.242781400680542, "reward_std": 0.22920340299606323, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5500128269195557, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8158218264579773, "step": 38 }, { "adv/mean_abs_final_conf": 0.5456191301345825, "adv/mean_abs_reasoning": 0.45056378841400146, "adv/mean_abs_step_conf": 0.7524350881576538, "adv/ratio_final_to_reasoning": 1.210969776455358, "adv/ratio_step_to_reasoning": 1.6699857101393982, "adv/std_final_conf": 0.7808905839920044, "adv/std_reasoning": 0.720582127571106, "adv/std_step_conf": 0.9357057213783264, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5267491010030914, "calib/avg_num_step_conf": 5.6796875, "calib/ece": 0.4690873015873016, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0012775219229070878, "calib/mean_conf": 0.9889285714285715, "calib/mu_c": 0.9895419847328244, "calib/mu_w": 0.9882644628099173, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4690873015873016, "calib/std_conf": 0.005274494742113267, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5766857142857144, "calib/step_q_c_n": 700.0, "calib/step_q_gap": 0.08383425539977268, "calib/step_q_w": 0.4928514588859417, "calib/step_q_w_n": 754.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2464.0, "completions/max_terminated_length": 2464.0, "completions/mean_length": 536.8359375, "completions/mean_terminated_length": 541.06298828125, "completions/min_length": 0.0, "completions/min_terminated_length": 209.0, "epoch": 0.0416, "grad_norm": 0.026082059368491173, "learning_rate": 4.472222222222223e-06, "loss": -0.0718, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.029997561126947403, "mask/share_reasoning": 0.8474767208099365, "mask/share_step_conf": 0.1147131398320198, "num_tokens": 9495451.0, "reward": 1.2392897605895996, "reward_std": 0.24177560210227966, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5226644277572632, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8283482193946838, "step": 39 }, { "adv/mean_abs_final_conf": 0.5052137970924377, "adv/mean_abs_reasoning": 0.43444058299064636, "adv/mean_abs_step_conf": 0.7465513348579407, "adv/ratio_final_to_reasoning": 1.1629065443531899, "adv/ratio_step_to_reasoning": 1.7184198808471218, "adv/std_final_conf": 0.7265942096710205, "adv/std_reasoning": 0.6817202568054199, "adv/std_step_conf": 0.9355455040931702, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5258540517081034, "calib/avg_num_step_conf": 5.140625, "calib/ece": 0.4883070866141732, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0018110236220473253, "calib/mean_conf": 0.9883070866141732, "calib/mu_c": 0.9892125984251967, "calib/mu_w": 0.9874015748031494, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4883070866141732, "calib/std_conf": 0.006265782354986256, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.607751572327044, "calib/step_q_c_n": 636.0, "calib/step_q_gap": 0.029222160562338062, "calib/step_q_w": 0.578529411764706, "calib/step_q_w_n": 680.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2583.0, "completions/max_terminated_length": 2583.0, "completions/mean_length": 553.20703125, "completions/mean_terminated_length": 553.20703125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.042666666666666665, "grad_norm": 0.039314769208431244, "learning_rate": 4.444444444444444e-06, "loss": 0.0366, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03152313828468323, "mask/share_reasoning": 0.8555469512939453, "mask/share_step_conf": 0.11292991787195206, "num_tokens": 9743832.0, "reward": 1.178615689277649, "reward_std": 0.24467621743679047, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.5084191560745239, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.775577962398529, "step": 40 }, { "adv/mean_abs_final_conf": 0.5695716142654419, "adv/mean_abs_reasoning": 0.4441681206226349, "adv/mean_abs_step_conf": 0.7738885879516602, "adv/ratio_final_to_reasoning": 1.2823333954427354, "adv/ratio_step_to_reasoning": 1.742332580885866, "adv/std_final_conf": 0.7791482210159302, "adv/std_reasoning": 0.7014320492744446, "adv/std_step_conf": 0.9351321458816528, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5342717717717718, "calib/avg_num_step_conf": 5.21484375, "calib/ece": 0.2792519685039371, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9960629921259843, "calib/gap": 0.002585585585585526, "calib/mean_conf": 0.9879133858267717, "calib/mu_c": 0.9886666666666666, "calib/mu_w": 0.9860810810810811, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2792519685039371, "calib/std_conf": 0.008463468157947225, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6133402922755742, "calib/step_q_c_n": 958.0, "calib/step_q_gap": 0.011828355936051693, "calib/step_q_w": 0.6015119363395225, "calib/step_q_w_n": 377.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2931.0, "completions/max_terminated_length": 2931.0, "completions/mean_length": 507.20703125, "completions/mean_terminated_length": 509.19610595703125, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.04373333333333333, "grad_norm": 0.032360244542360306, "learning_rate": 4.416666666666667e-06, "loss": 0.0269, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03386985510587692, "mask/share_reasoning": 0.8410245180130005, "mask/share_step_conf": 0.1211993470788002, "num_tokens": 9980925.0, "reward": 1.3508212566375732, "reward_std": 0.23536597192287445, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7109558582305908, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8258119821548462, "step": 41 }, { "adv/mean_abs_final_conf": 0.3804103136062622, "adv/mean_abs_reasoning": 0.2688877284526825, "adv/mean_abs_step_conf": 0.7513008117675781, "adv/ratio_final_to_reasoning": 1.4147552058077835, "adv/ratio_step_to_reasoning": 2.7941059865057705, "adv/std_final_conf": 0.6370423436164856, "adv/std_reasoning": 0.5482489466667175, "adv/std_step_conf": 0.9348362684249878, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.4952256131810522, "calib/avg_num_step_conf": 5.73046875, "calib/ece": 0.4145703125000001, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -3.182924545952481e-05, "calib/mean_conf": 0.9887890625000001, "calib/mu_c": 0.9887755102040816, "calib/mu_w": 0.9888073394495411, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4145703125000001, "calib/std_conf": 0.006099631576668688, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6229291716686675, "calib/step_q_c_n": 833.0, "calib/step_q_gap": 0.009916553372137482, "calib/step_q_w": 0.61301261829653, "calib/step_q_w_n": 634.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1134.0, "completions/max_terminated_length": 1134.0, "completions/mean_length": 441.42578125, "completions/mean_terminated_length": 443.1568908691406, "completions/min_length": 0.0, "completions/min_terminated_length": 176.0, "epoch": 0.0448, "grad_norm": 0.02571403607726097, "learning_rate": 4.388888888888889e-06, "loss": 0.0051, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.034706950187683105, "mask/share_reasoning": 0.8270288705825806, "mask/share_step_conf": 0.13435789942741394, "num_tokens": 10198298.0, "reward": 1.2550263404846191, "reward_std": 0.15356135368347168, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.5835870504379272, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.805810809135437, "step": 42 }, { "adv/mean_abs_final_conf": 0.5298599600791931, "adv/mean_abs_reasoning": 0.49790188670158386, "adv/mean_abs_step_conf": 0.7524062395095825, "adv/ratio_final_to_reasoning": 1.0641854835885836, "adv/ratio_step_to_reasoning": 1.5111536220398682, "adv/std_final_conf": 0.7560086846351624, "adv/std_reasoning": 0.7393122911453247, "adv/std_step_conf": 0.935416579246521, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4993055555555555, "calib/avg_num_step_conf": 5.375, "calib/ece": 0.42125984251968507, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0008143939393943089, "calib/mean_conf": 0.9881889763779528, "calib/mu_c": 0.9885416666666669, "calib/mu_w": 0.9877272727272726, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42125984251968507, "calib/std_conf": 0.007144991399701208, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6314912280701753, "calib/step_q_c_n": 798.0, "calib/step_q_gap": 0.029743823225884647, "calib/step_q_w": 0.6017474048442907, "calib/step_q_w_n": 578.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2407.0, "completions/max_terminated_length": 2407.0, "completions/mean_length": 523.609375, "completions/mean_terminated_length": 523.609375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.04586666666666667, "grad_norm": 0.033340878784656525, "learning_rate": 4.361111111111112e-06, "loss": 0.0667, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03283454850316048, "mask/share_reasoning": 0.84957355260849, "mask/share_step_conf": 0.11759185045957565, "num_tokens": 10437566.0, "reward": 1.2455248832702637, "reward_std": 0.24735292792320251, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.5728577971458435, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8036272525787354, "step": 43 }, { "adv/mean_abs_final_conf": 0.510025143623352, "adv/mean_abs_reasoning": 0.33438119292259216, "adv/mean_abs_step_conf": 0.7807687520980835, "adv/ratio_final_to_reasoning": 1.5252805911886937, "adv/ratio_step_to_reasoning": 2.3349661064186353, "adv/std_final_conf": 0.7340036034584045, "adv/std_reasoning": 0.5961487889289856, "adv/std_step_conf": 0.9347970485687256, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5302207968422351, "calib/avg_num_step_conf": 5.71484375, "calib/ece": 0.5129803921568628, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0017879610213397124, "calib/mean_conf": 0.9874901960784314, "calib/mu_c": 0.9884297520661157, "calib/mu_w": 0.986641791044776, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5129803921568628, "calib/std_conf": 0.007869954154436087, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6270942812982999, "calib/step_q_c_n": 647.0, "calib/step_q_gap": 0.06045212443555481, "calib/step_q_w": 0.5666421568627451, "calib/step_q_w_n": 816.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2108.0, "completions/max_terminated_length": 2108.0, "completions/mean_length": 542.76171875, "completions/mean_terminated_length": 542.76171875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.046933333333333334, "grad_norm": 0.04081565886735916, "learning_rate": 4.333333333333334e-06, "loss": 0.0692, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.029616590589284897, "mask/share_reasoning": 0.8555626273155212, "mask/share_step_conf": 0.11482080817222595, "num_tokens": 10682833.0, "reward": 1.1988677978515625, "reward_std": 0.18884721398353577, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.4864230155944824, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8087812662124634, "step": 44 }, { "adv/mean_abs_final_conf": 0.6177642345428467, "adv/mean_abs_reasoning": 0.5329394340515137, "adv/mean_abs_step_conf": 0.7456905245780945, "adv/ratio_final_to_reasoning": 1.1591640533080423, "adv/ratio_step_to_reasoning": 1.3992031306619663, "adv/std_final_conf": 0.8467596173286438, "adv/std_reasoning": 0.7926696538925171, "adv/std_step_conf": 0.9357033371925354, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5511443102352193, "calib/avg_num_step_conf": 5.52734375, "calib/ece": 0.4658964143426296, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": 0.010575969485060344, "calib/mean_conf": 0.9838247011952193, "calib/mu_c": 0.9889230769230769, "calib/mu_w": 0.9783471074380166, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.4658964143426296, "calib/std_conf": 0.06266741043070152, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5720665742024966, "calib/step_q_c_n": 721.0, "calib/step_q_gap": 0.03484755402958595, "calib/step_q_w": 0.5372190201729107, "calib/step_q_w_n": 694.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2385.0, "completions/max_terminated_length": 2385.0, "completions/mean_length": 547.453125, "completions/mean_terminated_length": 547.453125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.048, "grad_norm": 0.02756119892001152, "learning_rate": 4.305555555555556e-06, "loss": 0.0009, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03192518651485443, "mask/share_reasoning": 0.8486030101776123, "mask/share_step_conf": 0.11947186291217804, "num_tokens": 10928029.0, "reward": 1.2208482027053833, "reward_std": 0.25765717029571533, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5241742134094238, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8099329471588135, "step": 45 }, { "adv/mean_abs_final_conf": 0.5813736319541931, "adv/mean_abs_reasoning": 0.4622410535812378, "adv/mean_abs_step_conf": 0.7636514902114868, "adv/ratio_final_to_reasoning": 1.2577282512013357, "adv/ratio_step_to_reasoning": 1.6520633212802178, "adv/std_final_conf": 0.7970679402351379, "adv/std_reasoning": 0.7206328511238098, "adv/std_step_conf": 0.9353775978088379, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5165275569687335, "calib/avg_num_step_conf": 6.234375, "calib/ece": 0.435425101214575, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0006478537360893322, "calib/mean_conf": 0.9860323886639677, "calib/mu_c": 0.9863235294117648, "calib/mu_w": 0.9856756756756755, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.435425101214575, "calib/std_conf": 0.010706186657994667, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5459429280397022, "calib/step_q_c_n": 806.0, "calib/step_q_gap": 0.10791761158400609, "calib/step_q_w": 0.43802531645569615, "calib/step_q_w_n": 790.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2980.0, "completions/max_terminated_length": 2980.0, "completions/mean_length": 597.453125, "completions/mean_terminated_length": 602.157470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 209.0, "epoch": 0.04906666666666667, "grad_norm": 0.025706231594085693, "learning_rate": 4.277777777777778e-06, "loss": -0.0665, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.031699288636446, "mask/share_reasoning": 0.8346951007843018, "mask/share_step_conf": 0.12579315900802612, "num_tokens": 11185745.0, "reward": 1.2327275276184082, "reward_std": 0.2366017997264862, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.5433730483055115, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8114316463470459, "step": 46 }, { "adv/mean_abs_final_conf": 0.5593074560165405, "adv/mean_abs_reasoning": 0.4556795358657837, "adv/mean_abs_step_conf": 0.7513096332550049, "adv/ratio_final_to_reasoning": 1.227414031121379, "adv/ratio_step_to_reasoning": 1.6487675528977372, "adv/std_final_conf": 0.7847610712051392, "adv/std_reasoning": 0.7206171154975891, "adv/std_step_conf": 0.9353805184364319, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.535280778978258, "calib/avg_num_step_conf": 5.48046875, "calib/ece": 0.37621513944223106, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.002446311858076533, "calib/mean_conf": 0.9857768924302789, "calib/mu_c": 0.9867320261437909, "calib/mu_w": 0.9842857142857143, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.37621513944223106, "calib/std_conf": 0.010547465467741988, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.49233173076923076, "calib/step_q_c_n": 832.0, "calib/step_q_gap": 0.05213908628586822, "calib/step_q_w": 0.44019264448336254, "calib/step_q_w_n": 571.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2700.0, "completions/max_terminated_length": 2700.0, "completions/mean_length": 576.8515625, "completions/mean_terminated_length": 576.8515625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.050133333333333335, "grad_norm": 0.0265414509922266, "learning_rate": 4.25e-06, "loss": -0.0368, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03034675121307373, "mask/share_reasoning": 0.8593940734863281, "mask/share_step_conf": 0.11025922000408173, "num_tokens": 11439395.0, "reward": 1.3010945320129395, "reward_std": 0.2340029627084732, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6093801259994507, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8385920524597168, "step": 47 }, { "adv/mean_abs_final_conf": 0.6615752577781677, "adv/mean_abs_reasoning": 0.5312079191207886, "adv/mean_abs_step_conf": 0.7640124559402466, "adv/ratio_final_to_reasoning": 1.245416783080253, "adv/ratio_step_to_reasoning": 1.4382550192489163, "adv/std_final_conf": 0.8361097574234009, "adv/std_reasoning": 0.7576334476470947, "adv/std_step_conf": 0.935871958732605, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5431520863041726, "calib/avg_num_step_conf": 5.5546875, "calib/ece": 0.4841732283464569, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9960629921259843, "calib/gap": 0.0037795275590550848, "calib/mean_conf": 0.9841732283464568, "calib/mu_c": 0.9860629921259841, "calib/mu_w": 0.982283464566929, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4841732283464569, "calib/std_conf": 0.013599498883883185, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5214637681159421, "calib/step_q_c_n": 690.0, "calib/step_q_gap": 0.06504300308861966, "calib/step_q_w": 0.45642076502732243, "calib/step_q_w_n": 732.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2896.0, "completions/max_terminated_length": 2896.0, "completions/mean_length": 526.75, "completions/mean_terminated_length": 528.8157348632812, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.0512, "grad_norm": 0.02948327362537384, "learning_rate": 4.222222222222223e-06, "loss": 0.0157, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03321732580661774, "mask/share_reasoning": 0.8371387720108032, "mask/share_step_conf": 0.12573771178722382, "num_tokens": 11677931.0, "reward": 1.2243374586105347, "reward_std": 0.2565346360206604, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.5132398009300232, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8188894391059875, "step": 48 }, { "adv/mean_abs_final_conf": 0.6058684587478638, "adv/mean_abs_reasoning": 0.4402289390563965, "adv/mean_abs_step_conf": 0.782744824886322, "adv/ratio_final_to_reasoning": 1.3762576809387075, "adv/ratio_step_to_reasoning": 1.7780403681868056, "adv/std_final_conf": 0.7842022776603699, "adv/std_reasoning": 0.6817723512649536, "adv/std_step_conf": 0.9354808926582336, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5498268698060941, "calib/avg_num_step_conf": 5.78125, "calib/ece": 0.3696356275303644, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0026842105263159555, "calib/mean_conf": 0.9850202429149798, "calib/mu_c": 0.9860526315789475, "calib/mu_w": 0.9833684210526316, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.3696356275303644, "calib/std_conf": 0.009810945466831327, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5215863689776734, "calib/step_q_c_n": 851.0, "calib/step_q_gap": 0.08287412732425531, "calib/step_q_w": 0.4387122416534181, "calib/step_q_w_n": 629.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2991.0, "completions/max_terminated_length": 2991.0, "completions/mean_length": 528.73828125, "completions/mean_terminated_length": 532.9015502929688, "completions/min_length": 0.0, "completions/min_terminated_length": 207.0, "epoch": 0.05226666666666667, "grad_norm": 0.02996394783258438, "learning_rate": 4.194444444444445e-06, "loss": -0.0307, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.031121041625738144, "mask/share_reasoning": 0.8388256430625916, "mask/share_step_conf": 0.1222408264875412, "num_tokens": 11917824.0, "reward": 1.2675228118896484, "reward_std": 0.2394319772720337, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6055535078048706, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8092772364616394, "step": 49 }, { "adv/mean_abs_final_conf": 0.6307613849639893, "adv/mean_abs_reasoning": 0.38215214014053345, "adv/mean_abs_step_conf": 0.742942214012146, "adv/ratio_final_to_reasoning": 1.6505504449930117, "adv/ratio_step_to_reasoning": 1.9441006237435563, "adv/std_final_conf": 0.798173189163208, "adv/std_reasoning": 0.6613245010375977, "adv/std_step_conf": 0.9352914094924927, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5886194730125367, "calib/avg_num_step_conf": 5.75, "calib/ece": 0.29016000000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.992, "calib/gap": 0.0062382704001202605, "calib/mean_conf": 0.9812000000000002, "calib/mu_c": 0.983121387283237, "calib/mu_w": 0.9768831168831167, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28968000000000005, "calib/std_conf": 0.015929846201391904, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5423921765295887, "calib/step_q_c_n": 997.0, "calib/step_q_gap": 0.07986586074011504, "calib/step_q_w": 0.4625263157894737, "calib/step_q_w_n": 475.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2902.0, "completions/max_terminated_length": 2902.0, "completions/mean_length": 541.9765625, "completions/mean_terminated_length": 548.4031982421875, "completions/min_length": 0.0, "completions/min_terminated_length": 170.0, "epoch": 0.05333333333333334, "grad_norm": 0.031243592500686646, "learning_rate": 4.166666666666667e-06, "loss": -0.0862, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03204977139830589, "mask/share_reasoning": 0.8309839963912964, "mask/share_step_conf": 0.12524744868278503, "num_tokens": 12161930.0, "reward": 1.345628261566162, "reward_std": 0.23302677273750305, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.6890945434570312, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8358466625213623, "step": 50 }, { "adv/mean_abs_final_conf": 0.6398917436599731, "adv/mean_abs_reasoning": 0.3863760232925415, "adv/mean_abs_step_conf": 0.7543948888778687, "adv/ratio_final_to_reasoning": 1.6561372991188024, "adv/ratio_step_to_reasoning": 1.9524888797426352, "adv/std_final_conf": 0.8142772912979126, "adv/std_reasoning": 0.6612588167190552, "adv/std_step_conf": 0.9354052543640137, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.586183176100629, "calib/avg_num_step_conf": 5.84765625, "calib/ece": 0.36023529411764726, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9882352941176471, "calib/gap": 0.001495676100628951, "calib/mean_conf": 0.9787450980392157, "calib/mu_c": 0.979308176100629, "calib/mu_w": 0.9778125000000001, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35772549019607863, "calib/std_conf": 0.03563422189536946, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5560224719101123, "calib/step_q_c_n": 890.0, "calib/step_q_gap": 0.1265167058475093, "calib/step_q_w": 0.42950576606260304, "calib/step_q_w_n": 607.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2332.0, "completions/max_terminated_length": 2332.0, "completions/mean_length": 536.2578125, "completions/mean_terminated_length": 536.2578125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.0544, "grad_norm": 0.027206402271986008, "learning_rate": 4.138888888888889e-06, "loss": 0.0269, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.030329890549182892, "mask/share_reasoning": 0.8490919470787048, "mask/share_step_conf": 0.12057814747095108, "num_tokens": 12408508.0, "reward": 1.3251616954803467, "reward_std": 0.21376600861549377, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.635941743850708, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8458625674247742, "step": 51 }, { "adv/mean_abs_final_conf": 0.6179524064064026, "adv/mean_abs_reasoning": 0.42776966094970703, "adv/mean_abs_step_conf": 0.7270907163619995, "adv/ratio_final_to_reasoning": 1.4445914771853243, "adv/ratio_step_to_reasoning": 1.699724834967863, "adv/std_final_conf": 0.8091193437576294, "adv/std_reasoning": 0.7014753222465515, "adv/std_step_conf": 0.9353761672973633, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5595611285266457, "calib/avg_num_step_conf": 5.78125, "calib/ece": 0.29079681274900404, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": 0.0013942379459623844, "calib/mean_conf": 0.9808366533864543, "calib/mu_c": 0.981264367816092, "calib/mu_w": 0.9798701298701297, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2892031872509961, "calib/std_conf": 0.027938770758738494, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6002297702297702, "calib/step_q_c_n": 1001.0, "calib/step_q_gap": 0.10457215018801652, "calib/step_q_w": 0.4956576200417537, "calib/step_q_w_n": 479.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3004.0, "completions/max_terminated_length": 3004.0, "completions/mean_length": 539.76953125, "completions/mean_terminated_length": 541.8862915039062, "completions/min_length": 0.0, "completions/min_terminated_length": 214.0, "epoch": 0.055466666666666664, "grad_norm": 0.030449653044342995, "learning_rate": 4.111111111111111e-06, "loss": 0.0344, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03135004639625549, "mask/share_reasoning": 0.842849612236023, "mask/share_step_conf": 0.12189410626888275, "num_tokens": 12654641.0, "reward": 1.3621165752410889, "reward_std": 0.23896150290966034, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.690671443939209, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8507653474807739, "step": 52 }, { "adv/mean_abs_final_conf": 0.6005265116691589, "adv/mean_abs_reasoning": 0.35591036081314087, "adv/mean_abs_step_conf": 0.7810007929801941, "adv/ratio_final_to_reasoning": 1.6872970775482588, "adv/ratio_step_to_reasoning": 2.194374985869639, "adv/std_final_conf": 0.7910946011543274, "adv/std_reasoning": 0.64017653465271, "adv/std_step_conf": 0.9354143142700195, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5703883495145631, "calib/avg_num_step_conf": 5.72265625, "calib/ece": 0.3865490196078431, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.996078431372549, "calib/gap": 0.004896525293817233, "calib/mean_conf": 0.9826274509803922, "calib/mu_c": 0.9846052631578949, "calib/mu_w": 0.9797087378640776, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3865490196078431, "calib/std_conf": 0.015958230753959433, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5978512396694214, "calib/step_q_c_n": 847.0, "calib/step_q_gap": 0.06527842413544083, "calib/step_q_w": 0.5325728155339806, "calib/step_q_w_n": 618.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2317.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 543.703125, "completions/mean_terminated_length": 543.703125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.05653333333333333, "grad_norm": 0.02810330130159855, "learning_rate": 4.083333333333334e-06, "loss": -0.0089, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.02985529601573944, "mask/share_reasoning": 0.8550183773040771, "mask/share_step_conf": 0.11512631177902222, "num_tokens": 12899653.0, "reward": 1.2754499912261963, "reward_std": 0.20352619886398315, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6095238327980042, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.811703622341156, "step": 53 }, { "adv/mean_abs_final_conf": 0.48447397351264954, "adv/mean_abs_reasoning": 0.3503273129463196, "adv/mean_abs_step_conf": 0.7499665021896362, "adv/ratio_final_to_reasoning": 1.382918075779279, "adv/ratio_step_to_reasoning": 2.140759439743007, "adv/std_final_conf": 0.7099431157112122, "adv/std_reasoning": 0.6401990056037903, "adv/std_step_conf": 0.9351658225059509, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6277173913043479, "calib/avg_num_step_conf": 5.8828125, "calib/ece": 0.251372549019608, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9764705882352941, "calib/gap": 0.04779929577464814, "calib/mean_conf": 0.9729411764705883, "calib/mu_c": 0.9862500000000002, "calib/mu_w": 0.938450704225352, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.251372549019608, "calib/std_conf": 0.09634688386236717, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5997388781431334, "calib/step_q_c_n": 1034.0, "calib/step_q_gap": 0.13666684424482833, "calib/step_q_w": 0.4630720338983051, "calib/step_q_w_n": 472.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2310.0, "completions/max_terminated_length": 2310.0, "completions/mean_length": 483.65234375, "completions/mean_terminated_length": 483.65234375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.0576, "grad_norm": 0.030784349888563156, "learning_rate": 4.055555555555556e-06, "loss": 0.0105, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03423851728439331, "mask/share_reasoning": 0.8335533142089844, "mask/share_step_conf": 0.13220810890197754, "num_tokens": 13129700.0, "reward": 1.4109441041946411, "reward_std": 0.198542058467865, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.7428370714187622, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8684317469596863, "step": 54 }, { "adv/mean_abs_final_conf": 0.5835695266723633, "adv/mean_abs_reasoning": 0.4531455636024475, "adv/mean_abs_step_conf": 0.7731038928031921, "adv/ratio_final_to_reasoning": 1.2878191326271904, "adv/ratio_step_to_reasoning": 1.7060828901360483, "adv/std_final_conf": 0.8207448124885559, "adv/std_reasoning": 0.7205977439880371, "adv/std_step_conf": 0.9359610676765442, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6237734165923283, "calib/avg_num_step_conf": 5.60546875, "calib/ece": 0.44613545816733063, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9800796812749004, "calib/gap": 0.02398814833694407, "calib/mean_conf": 0.9760159362549802, "calib/mu_c": 0.9872932330827068, "calib/mu_w": 0.9633050847457627, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.44613545816733063, "calib/std_conf": 0.07534293274727284, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5862, "calib/step_q_c_n": 700.0, "calib/step_q_gap": 0.12093469387755101, "calib/step_q_w": 0.46526530612244904, "calib/step_q_w_n": 735.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2808.0, "completions/max_terminated_length": 2808.0, "completions/mean_length": 525.796875, "completions/mean_terminated_length": 529.93701171875, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.058666666666666666, "grad_norm": 0.03521295264363289, "learning_rate": 4.027777777777779e-06, "loss": -0.0709, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03193287178874016, "mask/share_reasoning": 0.8432319760322571, "mask/share_step_conf": 0.11702266335487366, "num_tokens": 13372128.0, "reward": 1.231523036956787, "reward_std": 0.2821798324584961, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.5472296476364136, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8079081177711487, "step": 55 }, { "adv/mean_abs_final_conf": 0.5679474472999573, "adv/mean_abs_reasoning": 0.5074265599250793, "adv/mean_abs_step_conf": 0.7627795338630676, "adv/ratio_final_to_reasoning": 1.1192702395866188, "adv/ratio_step_to_reasoning": 1.503231391702655, "adv/std_final_conf": 0.786614716053009, "adv/std_reasoning": 0.7575352191925049, "adv/std_step_conf": 0.9355356097221375, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6151002506265665, "calib/avg_num_step_conf": 5.94140625, "calib/ece": 0.4554940711462451, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9920948616600791, "calib/gap": 0.01192543859649109, "calib/mean_conf": 0.9811857707509883, "calib/mu_c": 0.9868421052631579, "calib/mu_w": 0.9749166666666668, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4554940711462451, "calib/std_conf": 0.033753598477446724, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.551680981595092, "calib/step_q_c_n": 815.0, "calib/step_q_gap": 0.05923055666591365, "calib/step_q_w": 0.4924504249291784, "calib/step_q_w_n": 706.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2259.0, "completions/max_terminated_length": 2259.0, "completions/mean_length": 535.55859375, "completions/mean_terminated_length": 537.6588745117188, "completions/min_length": 0.0, "completions/min_terminated_length": 204.0, "epoch": 0.05973333333333333, "grad_norm": 0.027493732050061226, "learning_rate": 4.000000000000001e-06, "loss": -0.0323, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03047093003988266, "mask/share_reasoning": 0.8427596688270569, "mask/share_step_conf": 0.12286314368247986, "num_tokens": 13616071.0, "reward": 1.2343281507492065, "reward_std": 0.2484326958656311, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.5414933562278748, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8131908178329468, "step": 56 }, { "adv/mean_abs_final_conf": 0.5659802556037903, "adv/mean_abs_reasoning": 0.4423058032989502, "adv/mean_abs_step_conf": 0.7457073926925659, "adv/ratio_final_to_reasoning": 1.2796129993828043, "adv/ratio_step_to_reasoning": 1.6859543490740716, "adv/std_final_conf": 0.8015991449356079, "adv/std_reasoning": 0.7013233304023743, "adv/std_step_conf": 0.9359928965568542, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6317563102947006, "calib/avg_num_step_conf": 5.8203125, "calib/ece": 0.347843137254902, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.984313725490196, "calib/gap": 0.019145632350997976, "calib/mean_conf": 0.9792156862745097, "calib/mu_c": 0.986273291925466, "calib/mu_w": 0.967127659574468, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.347843137254902, "calib/std_conf": 0.05214020078839857, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5801402373247033, "calib/step_q_c_n": 927.0, "calib/step_q_gap": 0.085859597893087, "calib/step_q_w": 0.4942806394316163, "calib/step_q_w_n": 563.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2627.0, "completions/max_terminated_length": 2627.0, "completions/mean_length": 521.7265625, "completions/mean_terminated_length": 521.7265625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.0608, "grad_norm": 0.02348768338561058, "learning_rate": 3.972222222222223e-06, "loss": 0.0309, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03096698224544525, "mask/share_reasoning": 0.8459857702255249, "mask/share_step_conf": 0.12304721772670746, "num_tokens": 13856425.0, "reward": 1.303645372390747, "reward_std": 0.2623915672302246, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6460027098655701, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8185346126556396, "step": 57 }, { "adv/mean_abs_final_conf": 0.6994427442550659, "adv/mean_abs_reasoning": 0.5924781560897827, "adv/mean_abs_step_conf": 0.7661586999893188, "adv/ratio_final_to_reasoning": 1.180537606434682, "adv/ratio_step_to_reasoning": 1.2931425270524521, "adv/std_final_conf": 0.8900883197784424, "adv/std_reasoning": 0.8265689611434937, "adv/std_step_conf": 0.9355772137641907, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5210778061224489, "calib/avg_num_step_conf": 7.28125, "calib/ece": 0.4255555555555556, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9880952380952381, "calib/gap": 0.000571428571428223, "calib/mean_conf": 0.9788888888888889, "calib/mu_c": 0.979142857142857, "calib/mu_w": 0.9785714285714288, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42444444444444446, "calib/std_conf": 0.023978238223214852, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5137659574468085, "calib/step_q_c_n": 940.0, "calib/step_q_gap": 0.02425297043382152, "calib/step_q_w": 0.489512987012987, "calib/step_q_w_n": 924.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2888.0, "completions/max_terminated_length": 2888.0, "completions/mean_length": 599.9921875, "completions/mean_terminated_length": 602.3451538085938, "completions/min_length": 0.0, "completions/min_terminated_length": 186.0, "epoch": 0.06186666666666667, "grad_norm": 0.02133341133594513, "learning_rate": 3.944444444444445e-06, "loss": 0.0349, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.027864959090948105, "mask/share_reasoning": 0.8431490063667297, "mask/share_step_conf": 0.12507976591587067, "num_tokens": 14116343.0, "reward": 1.2402286529541016, "reward_std": 0.30855613946914673, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5645425319671631, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8052229881286621, "step": 58 }, { "adv/mean_abs_final_conf": 0.5506974458694458, "adv/mean_abs_reasoning": 0.37096497416496277, "adv/mean_abs_step_conf": 0.7607747912406921, "adv/ratio_final_to_reasoning": 1.4844998429004206, "adv/ratio_step_to_reasoning": 2.050799520771971, "adv/std_final_conf": 0.7599132061004639, "adv/std_reasoning": 0.6403107047080994, "adv/std_step_conf": 0.9347589015960693, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5710547504025765, "calib/avg_num_step_conf": 6.234375, "calib/ece": 0.34059055118110243, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.984251968503937, "calib/gap": 0.015193236714975522, "calib/mean_conf": 0.9783858267716535, "calib/mu_c": 0.9838888888888888, "calib/mu_w": 0.9686956521739133, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34059055118110243, "calib/std_conf": 0.06270912066731307, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6074947589098533, "calib/step_q_c_n": 954.0, "calib/step_q_gap": 0.10428603616842025, "calib/step_q_w": 0.503208722741433, "calib/step_q_w_n": 642.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2898.0, "completions/max_terminated_length": 2898.0, "completions/mean_length": 562.83984375, "completions/mean_terminated_length": 562.83984375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.06293333333333333, "grad_norm": 0.02957838773727417, "learning_rate": 3.916666666666667e-06, "loss": 0.0696, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.031655244529247284, "mask/share_reasoning": 0.8429268002510071, "mask/share_step_conf": 0.12541794776916504, "num_tokens": 14366678.0, "reward": 1.3069705963134766, "reward_std": 0.21562360227108002, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6509472727775574, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8189969062805176, "step": 59 }, { "adv/mean_abs_final_conf": 0.6753536462783813, "adv/mean_abs_reasoning": 0.5778477191925049, "adv/mean_abs_step_conf": 0.7475611567497253, "adv/ratio_final_to_reasoning": 1.1687398320480231, "adv/ratio_step_to_reasoning": 1.2936992427596343, "adv/std_final_conf": 0.8445501327514648, "adv/std_reasoning": 0.7929325699806213, "adv/std_step_conf": 0.9358199834823608, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6227464140018423, "calib/avg_num_step_conf": 6.0, "calib/ece": 0.3924302788844621, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9840637450199203, "calib/gap": 0.005486248190551213, "calib/mean_conf": 0.9701195219123507, "calib/mu_c": 0.9723489932885904, "calib/mu_w": 0.9668627450980392, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38446215139442225, "calib/std_conf": 0.10780121173883471, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.574626168224299, "calib/step_q_c_n": 856.0, "calib/step_q_gap": 0.08472910940076961, "calib/step_q_w": 0.4898970588235294, "calib/step_q_w_n": 680.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2702.0, "completions/max_terminated_length": 2702.0, "completions/mean_length": 532.43359375, "completions/mean_terminated_length": 536.6259765625, "completions/min_length": 0.0, "completions/min_terminated_length": 209.0, "epoch": 0.064, "grad_norm": 0.031429000198841095, "learning_rate": 3.88888888888889e-06, "loss": -0.049, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03032679483294487, "mask/share_reasoning": 0.838133692741394, "mask/share_step_conf": 0.12372701615095139, "num_tokens": 14611837.0, "reward": 1.285160779953003, "reward_std": 0.31759822368621826, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.5961679220199585, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8308267593383789, "step": 60 }, { "adv/mean_abs_final_conf": 0.5964518785476685, "adv/mean_abs_reasoning": 0.4378543496131897, "adv/mean_abs_step_conf": 0.7456066012382507, "adv/ratio_final_to_reasoning": 1.362215264218769, "adv/ratio_step_to_reasoning": 1.702864438589993, "adv/std_final_conf": 0.7979384660720825, "adv/std_reasoning": 0.7013479471206665, "adv/std_step_conf": 0.9356203675270081, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6158138268916713, "calib/avg_num_step_conf": 5.67578125, "calib/ece": 0.3293725490196078, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.996078431372549, "calib/gap": 0.007047495917256175, "calib/mean_conf": 0.9842745098039215, "calib/mu_c": 0.9867065868263472, "calib/mu_w": 0.979659090909091, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3293725490196078, "calib/std_conf": 0.013021970019589218, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6622426868905742, "calib/step_q_c_n": 923.0, "calib/step_q_gap": 0.11929929066415912, "calib/step_q_w": 0.5429433962264151, "calib/step_q_w_n": 530.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1737.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 459.84765625, "completions/mean_terminated_length": 459.84765625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.06506666666666666, "grad_norm": 0.026003647595643997, "learning_rate": 3.861111111111112e-06, "loss": -0.0255, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.036012012511491776, "mask/share_reasoning": 0.827039361000061, "mask/share_step_conf": 0.1369486153125763, "num_tokens": 14833622.0, "reward": 1.3126163482666016, "reward_std": 0.2654881477355957, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.6656820178031921, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8153221607208252, "step": 61 }, { "adv/mean_abs_final_conf": 0.6631834506988525, "adv/mean_abs_reasoning": 0.5495285987854004, "adv/mean_abs_step_conf": 0.7788981199264526, "adv/ratio_final_to_reasoning": 1.2068224513968129, "adv/ratio_step_to_reasoning": 1.4173932378551688, "adv/std_final_conf": 0.8523333072662354, "adv/std_reasoning": 0.7753791213035583, "adv/std_step_conf": 0.9356539249420166, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.517351069982649, "calib/avg_num_step_conf": 6.375, "calib/ece": 0.44724, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.98, "calib/gap": 0.004370541738962785, "calib/mean_conf": 0.9741200000000001, "calib/mu_c": 0.9761654135338345, "calib/mu_w": 0.9717948717948717, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.44468, "calib/std_conf": 0.07066700503063647, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5814967741935484, "calib/step_q_c_n": 775.0, "calib/step_q_gap": 0.0856858056987993, "calib/step_q_w": 0.4958109684947491, "calib/step_q_w_n": 857.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3007.0, "completions/max_terminated_length": 3007.0, "completions/mean_length": 560.5859375, "completions/mean_terminated_length": 567.2332153320312, "completions/min_length": 0.0, "completions/min_terminated_length": 197.0, "epoch": 0.06613333333333334, "grad_norm": 0.03154468163847923, "learning_rate": 3.833333333333334e-06, "loss": -0.0568, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.029570816084742546, "mask/share_reasoning": 0.8387907147407532, "mask/share_step_conf": 0.11991971731185913, "num_tokens": 15084212.0, "reward": 1.1733200550079346, "reward_std": 0.3146999180316925, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.5397816300392151, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7538197636604309, "step": 62 }, { "adv/mean_abs_final_conf": 0.6640254259109497, "adv/mean_abs_reasoning": 0.475774347782135, "adv/mean_abs_step_conf": 0.7512675523757935, "adv/ratio_final_to_reasoning": 1.3956730307263603, "adv/ratio_step_to_reasoning": 1.5790417366507774, "adv/std_final_conf": 0.8391026258468628, "adv/std_reasoning": 0.7393614053726196, "adv/std_step_conf": 0.9358152747154236, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6436034658511723, "calib/avg_num_step_conf": 6.12890625, "calib/ece": 0.4007905138339921, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9525691699604744, "calib/gap": 0.02507581549439386, "calib/mean_conf": 0.9699604743083005, "calib/mu_c": 0.980763888888889, "calib/mu_w": 0.9556880733944951, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4007905138339921, "calib/std_conf": 0.06619230591708436, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5627252252252253, "calib/step_q_c_n": 888.0, "calib/step_q_gap": 0.03624945430011517, "calib/step_q_w": 0.5264757709251101, "calib/step_q_w_n": 681.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2497.0, "completions/max_terminated_length": 2497.0, "completions/mean_length": 585.31640625, "completions/mean_terminated_length": 587.61181640625, "completions/min_length": 0.0, "completions/min_terminated_length": 184.0, "epoch": 0.0672, "grad_norm": 0.03112921118736267, "learning_rate": 3.8055555555555556e-06, "loss": -0.0232, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02907847985625267, "mask/share_reasoning": 0.8485139608383179, "mask/share_step_conf": 0.11850129067897797, "num_tokens": 15342693.0, "reward": 1.2638373374938965, "reward_std": 0.2874062657356262, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.5950124859809875, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8112529516220093, "step": 63 }, { "adv/mean_abs_final_conf": 0.6524174213409424, "adv/mean_abs_reasoning": 0.5324134826660156, "adv/mean_abs_step_conf": 0.7383209466934204, "adv/ratio_final_to_reasoning": 1.2253961302294922, "adv/ratio_step_to_reasoning": 1.3867435193345228, "adv/std_final_conf": 0.8418984413146973, "adv/std_reasoning": 0.7927736043930054, "adv/std_step_conf": 0.9360528588294983, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5773400119617225, "calib/avg_num_step_conf": 5.52734375, "calib/ece": 0.2762301587301588, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9722222222222222, "calib/gap": 0.028032296650717825, "calib/mean_conf": 0.97234126984127, "calib/mu_c": 0.9807954545454546, "calib/mu_w": 0.9527631578947368, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2750793650793652, "calib/std_conf": 0.07486810229493363, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6347746781115882, "calib/step_q_c_n": 932.0, "calib/step_q_gap": 0.08777674850496286, "calib/step_q_w": 0.5469979296066253, "calib/step_q_w_n": 483.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2466.0, "completions/max_terminated_length": 2466.0, "completions/mean_length": 513.578125, "completions/mean_terminated_length": 517.6220703125, "completions/min_length": 0.0, "completions/min_terminated_length": 182.0, "epoch": 0.06826666666666667, "grad_norm": 0.036455199122428894, "learning_rate": 3.777777777777778e-06, "loss": 0.0045, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.031546011567115784, "mask/share_reasoning": 0.845130205154419, "mask/share_step_conf": 0.11551132053136826, "num_tokens": 15577945.0, "reward": 1.3307418823242188, "reward_std": 0.30955109000205994, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.7089675664901733, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8098517656326294, "step": 64 }, { "adv/mean_abs_final_conf": 0.4953068494796753, "adv/mean_abs_reasoning": 0.295492023229599, "adv/mean_abs_step_conf": 0.7505620718002319, "adv/ratio_final_to_reasoning": 1.6762105591419605, "adv/ratio_step_to_reasoning": 2.5400417364804495, "adv/std_final_conf": 0.7653894424438477, "adv/std_reasoning": 0.5959096550941467, "adv/std_step_conf": 0.9354201555252075, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5859900373599003, "calib/avg_num_step_conf": 5.49609375, "calib/ece": 0.4091796875000001, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.96875, "calib/gap": 0.010745952677459591, "calib/mean_conf": 0.9794921875000001, "calib/mu_c": 0.9841095890410959, "calib/mu_w": 0.9733636363636363, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4091796875000001, "calib/std_conf": 0.028670677555035978, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.672340425531915, "calib/step_q_c_n": 799.0, "calib/step_q_gap": 0.03360687290033604, "calib/step_q_w": 0.6387335526315789, "calib/step_q_w_n": 608.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1161.0, "completions/max_terminated_length": 1161.0, "completions/mean_length": 444.41796875, "completions/mean_terminated_length": 446.1607971191406, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.06933333333333333, "grad_norm": 0.036643534898757935, "learning_rate": 3.7500000000000005e-06, "loss": -0.0162, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.036343321204185486, "mask/share_reasoning": 0.828540563583374, "mask/share_step_conf": 0.13120990991592407, "num_tokens": 15796740.0, "reward": 1.2438958883285522, "reward_std": 0.20089185237884521, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.5919605493545532, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.7908843159675598, "step": 65 }, { "adv/mean_abs_final_conf": 0.6261056661605835, "adv/mean_abs_reasoning": 0.36768579483032227, "adv/mean_abs_step_conf": 0.7685141563415527, "adv/ratio_final_to_reasoning": 1.7028279986979522, "adv/ratio_step_to_reasoning": 2.090138284227713, "adv/std_final_conf": 0.8404014706611633, "adv/std_reasoning": 0.6612588167190552, "adv/std_step_conf": 0.9333556294441223, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6741760000000001, "calib/avg_num_step_conf": 6.80078125, "calib/ece": 0.4415200000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.88, "calib/gap": 0.06288000000000016, "calib/mean_conf": 0.9415200000000001, "calib/mu_c": 0.9729599999999999, "calib/mu_w": 0.9100799999999998, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4415200000000001, "calib/std_conf": 0.13628532422825282, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6151608751608751, "calib/step_q_c_n": 777.0, "calib/step_q_gap": 0.18636419466294984, "calib/step_q_w": 0.4287966804979253, "calib/step_q_w_n": 964.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3064.0, "completions/max_terminated_length": 3064.0, "completions/mean_length": 634.59375, "completions/mean_terminated_length": 637.0823974609375, "completions/min_length": 0.0, "completions/min_terminated_length": 176.0, "epoch": 0.0704, "grad_norm": 0.033459797501564026, "learning_rate": 3.7222222222222225e-06, "loss": 0.0443, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.0286815594881773, "mask/share_reasoning": 0.8461880683898926, "mask/share_step_conf": 0.12122409045696259, "num_tokens": 16065548.0, "reward": 1.1990708112716675, "reward_std": 0.2781738042831421, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.5546156167984009, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7752785682678223, "step": 66 }, { "adv/mean_abs_final_conf": 0.5760310888290405, "adv/mean_abs_reasoning": 0.3161531090736389, "adv/mean_abs_step_conf": 0.7715531587600708, "adv/ratio_final_to_reasoning": 1.8220003925214299, "adv/ratio_step_to_reasoning": 2.4404414716046943, "adv/std_final_conf": 0.7942258715629578, "adv/std_reasoning": 0.6186121702194214, "adv/std_step_conf": 0.9347555637359619, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7356792144026186, "calib/avg_num_step_conf": 6.1328125, "calib/ece": 0.33548, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.908, "calib/gap": 0.053210583742498985, "calib/mean_conf": 0.95948, "calib/mu_c": 0.9794871794871796, "calib/mu_w": 0.9262765957446806, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.33548, "calib/std_conf": 0.07957970595572718, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6555046826222686, "calib/step_q_c_n": 961.0, "calib/step_q_gap": 0.1265555857421372, "calib/step_q_w": 0.5289490968801314, "calib/step_q_w_n": 609.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2415.0, "completions/max_terminated_length": 2415.0, "completions/mean_length": 553.71484375, "completions/mean_terminated_length": 560.2806396484375, "completions/min_length": 0.0, "completions/min_terminated_length": 207.0, "epoch": 0.07146666666666666, "grad_norm": 0.02705869823694229, "learning_rate": 3.694444444444445e-06, "loss": -0.078, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.029447246342897415, "mask/share_reasoning": 0.8373123407363892, "mask/share_step_conf": 0.12152165174484253, "num_tokens": 16312307.0, "reward": 1.3032801151275635, "reward_std": 0.25523093342781067, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.655727744102478, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8164318799972534, "step": 67 }, { "adv/mean_abs_final_conf": 0.6699117422103882, "adv/mean_abs_reasoning": 0.3923490047454834, "adv/mean_abs_step_conf": 0.7669743299484253, "adv/ratio_final_to_reasoning": 1.707438362549077, "adv/ratio_step_to_reasoning": 1.9548267503468275, "adv/std_final_conf": 0.8446117639541626, "adv/std_reasoning": 0.6613078713417053, "adv/std_step_conf": 0.9356321692466736, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7924491392801252, "calib/avg_num_step_conf": 5.9375, "calib/ece": 0.3712000000000002, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.86, "calib/gap": 0.08661971830985882, "calib/mean_conf": 0.9392, "calib/mu_c": 0.9766197183098588, "calib/mu_w": 0.89, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.3712000000000002, "calib/std_conf": 0.11468984261912647, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.6703259452411995, "calib/step_q_c_n": 767.0, "calib/step_q_gap": 0.19024626396629907, "calib/step_q_w": 0.4800796812749004, "calib/step_q_w_n": 753.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2676.0, "completions/max_terminated_length": 2676.0, "completions/mean_length": 539.84765625, "completions/mean_terminated_length": 541.9647216796875, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.07253333333333334, "grad_norm": 0.035893820226192474, "learning_rate": 3.6666666666666666e-06, "loss": 0.0214, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03266291320323944, "mask/share_reasoning": 0.8366715908050537, "mask/share_step_conf": 0.12675921618938446, "num_tokens": 16554596.0, "reward": 1.2877839803695679, "reward_std": 0.25560203194618225, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6310445070266724, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8191367387771606, "step": 68 }, { "adv/mean_abs_final_conf": 0.7307136058807373, "adv/mean_abs_reasoning": 0.4282193183898926, "adv/mean_abs_step_conf": 0.7660757303237915, "adv/ratio_final_to_reasoning": 1.706400375929384, "adv/ratio_step_to_reasoning": 1.7889798461317468, "adv/std_final_conf": 0.8997459411621094, "adv/std_reasoning": 0.7013484835624695, "adv/std_step_conf": 0.9359602928161621, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7487881107284092, "calib/avg_num_step_conf": 6.60546875, "calib/ece": 0.38139442231075693, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.601593625498008, "calib/gap": 0.13963324403622912, "calib/mean_conf": 0.8384462151394423, "calib/mu_c": 0.912991452991453, "calib/mu_w": 0.7733582089552239, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3768525896414342, "calib/std_conf": 0.20519637881818759, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6215023474178404, "calib/step_q_c_n": 639.0, "calib/step_q_gap": 0.19641679608704188, "calib/step_q_w": 0.4250855513307985, "calib/step_q_w_n": 1052.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2853.0, "completions/max_terminated_length": 2853.0, "completions/mean_length": 630.01171875, "completions/mean_terminated_length": 632.4823608398438, "completions/min_length": 0.0, "completions/min_terminated_length": 235.0, "epoch": 0.0736, "grad_norm": 0.03961062431335449, "learning_rate": 3.638888888888889e-06, "loss": 0.0818, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.027123235166072845, "mask/share_reasoning": 0.8559575080871582, "mask/share_step_conf": 0.11301306635141373, "num_tokens": 16820375.0, "reward": 1.26645028591156, "reward_std": 0.23711402714252472, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.6274238228797913, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8089882731437683, "step": 69 }, { "adv/mean_abs_final_conf": 0.7056593894958496, "adv/mean_abs_reasoning": 0.41198545694351196, "adv/mean_abs_step_conf": 0.7569766044616699, "adv/ratio_final_to_reasoning": 1.7128259689822105, "adv/ratio_step_to_reasoning": 1.837386712816565, "adv/std_final_conf": 0.8766942024230957, "adv/std_reasoning": 0.6816877722740173, "adv/std_step_conf": 0.9351736307144165, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.8069104349547884, "calib/avg_num_step_conf": 6.0234375, "calib/ece": 0.20653225806451614, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5201612903225806, "calib/gap": 0.3008685895320442, "calib/mean_conf": 0.7592741935483871, "calib/mu_c": 0.8915107913669066, "calib/mu_w": 0.5906422018348624, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.20266129032258062, "calib/std_conf": 0.28556727185267905, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5986434108527132, "calib/step_q_c_n": 774.0, "calib/step_q_gap": 0.20463299418604652, "calib/step_q_w": 0.3940104166666667, "calib/step_q_w_n": 768.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2552.0, "completions/max_terminated_length": 2552.0, "completions/mean_length": 609.78125, "completions/mean_terminated_length": 612.172607421875, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.07466666666666667, "grad_norm": 0.03800712525844574, "learning_rate": 3.6111111111111115e-06, "loss": -0.0097, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.030685387551784515, "mask/share_reasoning": 0.8435661792755127, "mask/share_step_conf": 0.12184222042560577, "num_tokens": 17083471.0, "reward": 1.3623967170715332, "reward_std": 0.23922546207904816, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.7564241886138916, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8330126404762268, "step": 70 }, { "adv/mean_abs_final_conf": 0.7434986233711243, "adv/mean_abs_reasoning": 0.46663838624954224, "adv/mean_abs_step_conf": 0.7635763883590698, "adv/ratio_final_to_reasoning": 1.5933078916776613, "adv/ratio_step_to_reasoning": 1.6363342812323531, "adv/std_final_conf": 0.9111484885215759, "adv/std_reasoning": 0.7392783164978027, "adv/std_step_conf": 0.9355947971343994, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6251235788433019, "calib/avg_num_step_conf": 6.484375, "calib/ece": 0.2691372549019608, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.4666666666666667, "calib/gap": 0.0944432773109245, "calib/mean_conf": 0.7530588235294118, "calib/mu_c": 0.7971323529411766, "calib/mu_w": 0.7026890756302521, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.24443137254901962, "calib/std_conf": 0.2693813836795812, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.515, "calib/step_q_c_n": 890.0, "calib/step_q_gap": 0.05715584415584413, "calib/step_q_w": 0.4578441558441559, "calib/step_q_w_n": 770.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2472.0, "completions/max_terminated_length": 2472.0, "completions/mean_length": 563.7109375, "completions/mean_terminated_length": 563.7109375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.07573333333333333, "grad_norm": 0.0358133465051651, "learning_rate": 3.5833333333333335e-06, "loss": -0.0344, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03034607507288456, "mask/share_reasoning": 0.8403059244155884, "mask/share_step_conf": 0.1293479949235916, "num_tokens": 17332189.0, "reward": 1.295250415802002, "reward_std": 0.2442229986190796, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6744769811630249, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8056682348251343, "step": 71 }, { "adv/mean_abs_final_conf": 0.7388667464256287, "adv/mean_abs_reasoning": 0.3799426555633545, "adv/mean_abs_step_conf": 0.7495532035827637, "adv/ratio_final_to_reasoning": 1.9446796394315997, "adv/ratio_step_to_reasoning": 1.972806139577496, "adv/std_final_conf": 0.9300165176391602, "adv/std_reasoning": 0.681516170501709, "adv/std_step_conf": 0.9357799887657166, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.798740157480315, "calib/avg_num_step_conf": 5.90625, "calib/ece": 0.18142857142857138, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.36904761904761907, "calib/gap": 0.29151874015748, "calib/mean_conf": 0.6754761904761906, "calib/mu_c": 0.8200787401574801, "calib/mu_w": 0.52856, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1764682539682539, "calib/std_conf": 0.2909224713798188, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5878830083565459, "calib/step_q_c_n": 718.0, "calib/step_q_gap": 0.16847494790314543, "calib/step_q_w": 0.4194080604534005, "calib/step_q_w_n": 794.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2354.0, "completions/max_terminated_length": 2354.0, "completions/mean_length": 524.52734375, "completions/mean_terminated_length": 528.657470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 229.0, "epoch": 0.0768, "grad_norm": 0.08097515255212784, "learning_rate": 3.555555555555556e-06, "loss": -0.0603, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.029277801513671875, "mask/share_reasoning": 0.8439998626708984, "mask/share_step_conf": 0.11890986561775208, "num_tokens": 17570876.0, "reward": 1.3871753215789795, "reward_std": 0.23918916285037994, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.7695007920265198, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8539873957633972, "step": 72 }, { "adv/mean_abs_final_conf": 0.740119457244873, "adv/mean_abs_reasoning": 0.476161926984787, "adv/mean_abs_step_conf": 0.7462303638458252, "adv/ratio_final_to_reasoning": 1.5543440483189226, "adv/ratio_step_to_reasoning": 1.5671777216023965, "adv/std_final_conf": 0.9278834462165833, "adv/std_reasoning": 0.7207294702529907, "adv/std_step_conf": 0.9352030158042908, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.788538984617416, "calib/avg_num_step_conf": 6.16796875, "calib/ece": 0.08587301587301585, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.30952380952380953, "calib/gap": 0.3112537136066548, "calib/mean_conf": 0.6396825396825397, "calib/mu_c": 0.7619607843137255, "calib/mu_w": 0.45070707070707067, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.059206349206349165, "calib/std_conf": 0.3082166742189103, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5835434007134364, "calib/step_q_c_n": 841.0, "calib/step_q_gap": 0.22873310261045532, "calib/step_q_w": 0.35481029810298104, "calib/step_q_w_n": 738.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2976.0, "completions/max_terminated_length": 2976.0, "completions/mean_length": 553.109375, "completions/mean_terminated_length": 553.109375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.07786666666666667, "grad_norm": 0.0581560842692852, "learning_rate": 3.5277777777777784e-06, "loss": -0.0039, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.029643334448337555, "mask/share_reasoning": 0.8481807708740234, "mask/share_step_conf": 0.1221759170293808, "num_tokens": 17819504.0, "reward": 1.4292724132537842, "reward_std": 0.21470966935157776, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7984195351600647, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8722500801086426, "step": 73 }, { "adv/mean_abs_final_conf": 0.7195020914077759, "adv/mean_abs_reasoning": 0.5106623768806458, "adv/mean_abs_step_conf": 0.7628088593482971, "adv/ratio_final_to_reasoning": 1.4089584899573306, "adv/ratio_step_to_reasoning": 1.4937635782136034, "adv/std_final_conf": 0.9145419001579285, "adv/std_reasoning": 0.7752376198768616, "adv/std_step_conf": 0.935356080532074, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7208216619981327, "calib/avg_num_step_conf": 5.70703125, "calib/ece": 0.16078740157480317, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.2204724409448819, "calib/gap": 0.2381313414254589, "calib/mean_conf": 0.5514173228346457, "calib/mu_c": 0.6779831932773107, "calib/mu_w": 0.4398518518518518, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12185039370078743, "calib/std_conf": 0.3114955653921489, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.49009921259842515, "calib/step_q_c_n": 635.0, "calib/step_q_gap": 0.09882802615774716, "calib/step_q_w": 0.391271186440678, "calib/step_q_w_n": 826.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1243.0, "completions/max_terminated_length": 1243.0, "completions/mean_length": 511.04296875, "completions/mean_terminated_length": 513.047119140625, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.07893333333333333, "grad_norm": 0.04147012531757355, "learning_rate": 3.5e-06, "loss": -0.047, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03193596377968788, "mask/share_reasoning": 0.8396960496902466, "mask/share_step_conf": 0.12446170300245285, "num_tokens": 18054259.0, "reward": 1.3750741481781006, "reward_std": 0.20677241683006287, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.7596992254257202, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8495213985443115, "step": 74 }, { "adv/mean_abs_final_conf": 0.7395988702774048, "adv/mean_abs_reasoning": 0.3775707483291626, "adv/mean_abs_step_conf": 0.7631238698959351, "adv/ratio_final_to_reasoning": 1.958835194596774, "adv/ratio_step_to_reasoning": 2.0211413974015033, "adv/std_final_conf": 0.909938633441925, "adv/std_reasoning": 0.6612691283226013, "adv/std_step_conf": 0.9357552528381348, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7381803589668758, "calib/avg_num_step_conf": 5.8515625, "calib/ece": 0.15396078431372548, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.4117647058823529, "calib/gap": 0.3014767255216695, "calib/mean_conf": 0.656156862745098, "calib/mu_c": 0.7471910112359551, "calib/mu_w": 0.4457142857142856, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.056039215686274534, "calib/std_conf": 0.3397532768341895, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5519630709426628, "calib/step_q_c_n": 1029.0, "calib/step_q_gap": 0.19177117328807858, "calib/step_q_w": 0.36019189765458426, "calib/step_q_w_n": 469.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2300.0, "completions/max_terminated_length": 2300.0, "completions/mean_length": 493.79296875, "completions/mean_terminated_length": 493.79296875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.08, "grad_norm": 0.05103519186377525, "learning_rate": 3.4722222222222224e-06, "loss": 0.0104, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03264331445097923, "mask/share_reasoning": 0.8353633284568787, "mask/share_step_conf": 0.1319933533668518, "num_tokens": 18285422.0, "reward": 1.4125370979309082, "reward_std": 0.2209436148405075, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.7917730212211609, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.848291277885437, "step": 75 }, { "adv/mean_abs_final_conf": 0.7181636691093445, "adv/mean_abs_reasoning": 0.3028116524219513, "adv/mean_abs_step_conf": 0.7950438261032104, "adv/ratio_final_to_reasoning": 2.3716513660069563, "adv/ratio_step_to_reasoning": 2.625539076004119, "adv/std_final_conf": 0.8976917266845703, "adv/std_reasoning": 0.6183298826217651, "adv/std_step_conf": 0.9354302883148193, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7375237122428133, "calib/avg_num_step_conf": 5.7578125, "calib/ece": 0.1915294117647059, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.34901960784313724, "calib/gap": 0.29070698963957414, "calib/mean_conf": 0.5864313725490197, "calib/mu_c": 0.6742134831460675, "calib/mu_w": 0.3835064935064934, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03996078431372553, "calib/std_conf": 0.35051588058913247, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4840788177339902, "calib/step_q_c_n": 1015.0, "calib/step_q_gap": 0.09244483080588561, "calib/step_q_w": 0.3916339869281046, "calib/step_q_w_n": 459.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2446.0, "completions/max_terminated_length": 2446.0, "completions/mean_length": 522.375, "completions/mean_terminated_length": 522.375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.08106666666666666, "grad_norm": 0.048353470861911774, "learning_rate": 3.444444444444445e-06, "loss": 0.0597, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03172909840941429, "mask/share_reasoning": 0.8478416204452515, "mask/share_step_conf": 0.12042926251888275, "num_tokens": 18522206.0, "reward": 1.414903163909912, "reward_std": 0.17302893102169037, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.7734194993972778, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8590527176856995, "step": 76 }, { "adv/mean_abs_final_conf": 0.744149923324585, "adv/mean_abs_reasoning": 0.46821609139442444, "adv/mean_abs_step_conf": 0.7633011341094971, "adv/ratio_final_to_reasoning": 1.5893300913866164, "adv/ratio_step_to_reasoning": 1.6302325958859314, "adv/std_final_conf": 0.909368097782135, "adv/std_reasoning": 0.7392141819000244, "adv/std_step_conf": 0.9352315664291382, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7154060027090612, "calib/avg_num_step_conf": 6.23046875, "calib/ece": 0.16301587301587295, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.36507936507936506, "calib/gap": 0.2542767519783275, "calib/mean_conf": 0.6429365079365079, "calib/mu_c": 0.7266863905325444, "calib/mu_w": 0.47240963855421686, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06765873015873013, "calib/std_conf": 0.3292308676639026, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4918036072144289, "calib/step_q_c_n": 998.0, "calib/step_q_gap": 0.11434967086602021, "calib/step_q_w": 0.3774539363484087, "calib/step_q_w_n": 597.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2986.0, "completions/max_terminated_length": 2986.0, "completions/mean_length": 556.34765625, "completions/mean_terminated_length": 558.5294189453125, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.08213333333333334, "grad_norm": 0.045243460685014725, "learning_rate": 3.416666666666667e-06, "loss": 0.022, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03196696564555168, "mask/share_reasoning": 0.8324825763702393, "mask/share_step_conf": 0.13164415955543518, "num_tokens": 18769295.0, "reward": 1.3950183391571045, "reward_std": 0.21794402599334717, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7689140439033508, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8464987874031067, "step": 77 }, { "adv/mean_abs_final_conf": 0.697986364364624, "adv/mean_abs_reasoning": 0.39741477370262146, "adv/mean_abs_step_conf": 0.7532141804695129, "adv/ratio_final_to_reasoning": 1.7563171038198873, "adv/ratio_step_to_reasoning": 1.895284801448096, "adv/std_final_conf": 0.8897503614425659, "adv/std_reasoning": 0.6815274953842163, "adv/std_step_conf": 0.9348727464675903, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.66003300330033, "calib/avg_num_step_conf": 6.09765625, "calib/ece": 0.1886055776892431, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.42231075697211157, "calib/gap": 0.22551155115511556, "calib/mean_conf": 0.652589641434263, "calib/mu_c": 0.7433333333333333, "calib/mu_w": 0.5178217821782177, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12179282868525904, "calib/std_conf": 0.34061706795804225, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5183182844243792, "calib/step_q_c_n": 886.0, "calib/step_q_gap": 0.13551828442437924, "calib/step_q_w": 0.3828, "calib/step_q_w_n": 675.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2880.0, "completions/max_terminated_length": 2880.0, "completions/mean_length": 587.96484375, "completions/mean_terminated_length": 590.2706298828125, "completions/min_length": 0.0, "completions/min_terminated_length": 218.0, "epoch": 0.0832, "grad_norm": 0.04956922307610512, "learning_rate": 3.3888888888888893e-06, "loss": 0.1075, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.028507526963949203, "mask/share_reasoning": 0.8540570735931396, "mask/share_step_conf": 0.11352914571762085, "num_tokens": 19027838.0, "reward": 1.3692377805709839, "reward_std": 0.23170146346092224, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7343155741691589, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8454394340515137, "step": 78 }, { "adv/mean_abs_final_conf": 0.6280073523521423, "adv/mean_abs_reasoning": 0.3942117691040039, "adv/mean_abs_step_conf": 0.7584797143936157, "adv/ratio_final_to_reasoning": 1.5930710383901723, "adv/ratio_step_to_reasoning": 1.9240412738502182, "adv/std_final_conf": 0.8607851266860962, "adv/std_reasoning": 0.7012507915496826, "adv/std_step_conf": 0.9304596781730652, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6052868625277162, "calib/avg_num_step_conf": 6.18359375, "calib/ece": 0.20690476190476187, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5277777777777778, "calib/gap": 0.131879157427938, "calib/mean_conf": 0.7615079365079365, "calib/mu_c": 0.8075609756097563, "calib/mu_w": 0.6756818181818183, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.15880952380952376, "calib/std_conf": 0.2860016301993651, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5340417457305503, "calib/step_q_c_n": 1054.0, "calib/step_q_gap": 0.029372558584992547, "calib/step_q_w": 0.5046691871455578, "calib/step_q_w_n": 529.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1238.0, "completions/max_terminated_length": 1238.0, "completions/mean_length": 532.50390625, "completions/mean_terminated_length": 538.8181762695312, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.08426666666666667, "grad_norm": 0.04731028527021408, "learning_rate": 3.3611111111111117e-06, "loss": -0.1041, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.028967462480068207, "mask/share_reasoning": 0.8404107689857483, "mask/share_step_conf": 0.11890304088592529, "num_tokens": 19270535.0, "reward": 1.3517359495162964, "reward_std": 0.2281036078929901, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7270851731300354, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8256934285163879, "step": 79 }, { "adv/mean_abs_final_conf": 0.5573588013648987, "adv/mean_abs_reasoning": 0.41972774267196655, "adv/mean_abs_step_conf": 0.7844960689544678, "adv/ratio_final_to_reasoning": 1.3279055556746844, "adv/ratio_step_to_reasoning": 1.8690593668181275, "adv/std_final_conf": 0.8060113191604614, "adv/std_reasoning": 0.7012417316436768, "adv/std_step_conf": 0.9352529048919678, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5498077942735948, "calib/avg_num_step_conf": 5.92578125, "calib/ece": 0.32324218749999994, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.86328125, "calib/gap": 0.03450424178154843, "calib/mean_conf": 0.9175390625000002, "calib/mu_c": 0.9299390243902439, "calib/mu_w": 0.8954347826086955, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3000781249999999, "calib/std_conf": 0.19404966351844335, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6749524815205913, "calib/step_q_c_n": 947.0, "calib/step_q_gap": 0.03404020081883685, "calib/step_q_w": 0.6409122807017544, "calib/step_q_w_n": 570.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1760.0, "completions/max_terminated_length": 1760.0, "completions/mean_length": 478.0703125, "completions/mean_terminated_length": 479.94512939453125, "completions/min_length": 0.0, "completions/min_terminated_length": 218.0, "epoch": 0.08533333333333333, "grad_norm": 0.04117859527468681, "learning_rate": 3.3333333333333333e-06, "loss": -0.0116, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03346114605665207, "mask/share_reasoning": 0.8312935829162598, "mask/share_step_conf": 0.13133902847766876, "num_tokens": 19495081.0, "reward": 1.2891900539398193, "reward_std": 0.23486578464508057, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6713261604309082, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.7894644141197205, "step": 80 }, { "adv/mean_abs_final_conf": 0.5828555822372437, "adv/mean_abs_reasoning": 0.4114669859409332, "adv/mean_abs_step_conf": 0.7615097761154175, "adv/ratio_final_to_reasoning": 1.4165306140038987, "adv/ratio_step_to_reasoning": 1.8507190178915922, "adv/std_final_conf": 0.8089916110038757, "adv/std_reasoning": 0.7014133930206299, "adv/std_step_conf": 0.934392511844635, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7270913685330496, "calib/avg_num_step_conf": 6.40234375, "calib/ece": 0.2655823293172691, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7670682730923695, "calib/gap": 0.2501363213193246, "calib/mean_conf": 0.8478313253012049, "calib/mu_c": 0.9513013698630136, "calib/mu_w": 0.701165048543689, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2635341365461848, "calib/std_conf": 0.28008195097094235, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7073521850899742, "calib/step_q_c_n": 778.0, "calib/step_q_gap": 0.2678283755661647, "calib/step_q_w": 0.43952380952380954, "calib/step_q_w_n": 861.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2411.0, "completions/max_terminated_length": 2411.0, "completions/mean_length": 552.83984375, "completions/mean_terminated_length": 559.395263671875, "completions/min_length": 0.0, "completions/min_terminated_length": 218.0, "epoch": 0.0864, "grad_norm": 0.03954046592116356, "learning_rate": 3.3055555555555558e-06, "loss": -0.0206, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.030756745487451553, "mask/share_reasoning": 0.8393161296844482, "mask/share_step_conf": 0.11820834130048752, "num_tokens": 19742856.0, "reward": 1.2981681823730469, "reward_std": 0.3054763376712799, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7119581699371338, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7875015139579773, "step": 81 }, { "adv/mean_abs_final_conf": 0.4838182330131531, "adv/mean_abs_reasoning": 0.4532151222229004, "adv/mean_abs_step_conf": 0.7782233953475952, "adv/ratio_final_to_reasoning": 1.0675244697047013, "adv/ratio_step_to_reasoning": 1.717117009535373, "adv/std_final_conf": 0.740546464920044, "adv/std_reasoning": 0.7205851078033447, "adv/std_step_conf": 0.9346796274185181, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6594101123595506, "calib/avg_num_step_conf": 6.1953125, "calib/ece": 0.29353413654618477, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8795180722891566, "calib/gap": 0.13055688202247184, "calib/mean_conf": 0.9193975903614459, "calib/mu_c": 0.9660624999999999, "calib/mu_w": 0.8355056179775281, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2851807228915663, "calib/std_conf": 0.2104213561946316, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7590744920993229, "calib/step_q_c_n": 886.0, "calib/step_q_gap": 0.2732459206707515, "calib/step_q_w": 0.4858285714285714, "calib/step_q_w_n": 700.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3005.0, "completions/max_terminated_length": 3005.0, "completions/mean_length": 501.48046875, "completions/mean_terminated_length": 511.4701232910156, "completions/min_length": 0.0, "completions/min_terminated_length": 220.0, "epoch": 0.08746666666666666, "grad_norm": 0.04062870889902115, "learning_rate": 3.277777777777778e-06, "loss": -0.0729, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03170660138130188, "mask/share_reasoning": 0.8233975768089294, "mask/share_step_conf": 0.12536457180976868, "num_tokens": 19976787.0, "reward": 1.303025245666504, "reward_std": 0.2826133072376251, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.689989447593689, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7982649207115173, "step": 82 }, { "adv/mean_abs_final_conf": 0.4387087821960449, "adv/mean_abs_reasoning": 0.36228880286216736, "adv/mean_abs_step_conf": 0.755591630935669, "adv/ratio_final_to_reasoning": 1.2109366304730966, "adv/ratio_step_to_reasoning": 2.085605806655674, "adv/std_final_conf": 0.7206283807754517, "adv/std_reasoning": 0.6611847877502441, "adv/std_step_conf": 0.9289259910583496, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.576388888888889, "calib/avg_num_step_conf": 5.6015625, "calib/ece": 0.37199203187251006, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8884462151394422, "calib/gap": 0.07596962616822422, "calib/mean_conf": 0.9238645418326695, "calib/mu_c": 0.9562499999999999, "calib/mu_w": 0.8802803738317757, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.36107569721115546, "calib/std_conf": 0.20589084829598686, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7503230337078652, "calib/step_q_c_n": 712.0, "calib/step_q_gap": 0.16447815836160484, "calib/step_q_w": 0.5858448753462604, "calib/step_q_w_n": 722.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2452.0, "completions/max_terminated_length": 2452.0, "completions/mean_length": 542.39453125, "completions/mean_terminated_length": 551.0040283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.08853333333333334, "grad_norm": 0.06323331594467163, "learning_rate": 3.2500000000000002e-06, "loss": -0.0869, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.031102724373340607, "mask/share_reasoning": 0.8401299715042114, "mask/share_step_conf": 0.11314228922128677, "num_tokens": 20222904.0, "reward": 1.235900640487671, "reward_std": 0.23764778673648834, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6153316497802734, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7739379405975342, "step": 83 }, { "adv/mean_abs_final_conf": 0.43967533111572266, "adv/mean_abs_reasoning": 0.36439767479896545, "adv/mean_abs_step_conf": 0.7603832483291626, "adv/ratio_final_to_reasoning": 1.2065810555961618, "adv/ratio_step_to_reasoning": 2.0866852368052524, "adv/std_final_conf": 0.7023812532424927, "adv/std_reasoning": 0.6402844190597534, "adv/std_step_conf": 0.9353541731834412, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5685534989858012, "calib/avg_num_step_conf": 5.16015625, "calib/ece": 0.42261904761904767, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9246031746031746, "calib/gap": 0.060053245436105684, "calib/mean_conf": 0.9490476190476191, "calib/mu_c": 0.9766911764705883, "calib/mu_w": 0.9166379310344827, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4159920634920636, "calib/std_conf": 0.16705634716840811, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7947251114413076, "calib/step_q_c_n": 673.0, "calib/step_q_gap": 0.1365152348980978, "calib/step_q_w": 0.6582098765432098, "calib/step_q_w_n": 648.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2789.0, "completions/max_terminated_length": 2789.0, "completions/mean_length": 497.1640625, "completions/mean_terminated_length": 497.1640625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.0896, "grad_norm": 0.03995829448103905, "learning_rate": 3.2222222222222227e-06, "loss": 0.0282, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.0344880037009716, "mask/share_reasoning": 0.8458722829818726, "mask/share_step_conf": 0.11963975429534912, "num_tokens": 20456098.0, "reward": 1.1661443710327148, "reward_std": 0.2615615427494049, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.5767694711685181, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7261971235275269, "step": 84 }, { "adv/mean_abs_final_conf": 0.5147285461425781, "adv/mean_abs_reasoning": 0.47191280126571655, "adv/mean_abs_step_conf": 0.7632074952125549, "adv/ratio_final_to_reasoning": 1.0907280852776733, "adv/ratio_step_to_reasoning": 1.6172638105293125, "adv/std_final_conf": 0.7584975957870483, "adv/std_reasoning": 0.7393365502357483, "adv/std_step_conf": 0.9357002377510071, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6181605351170568, "calib/avg_num_step_conf": 6.11328125, "calib/ece": 0.4077551020408164, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9020408163265307, "calib/gap": 0.09645819397993294, "calib/mean_conf": 0.9338775510204083, "calib/mu_c": 0.9791538461538462, "calib/mu_w": 0.8826956521739132, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40551020408163274, "calib/std_conf": 0.1852920070143077, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7397910863509749, "calib/step_q_c_n": 718.0, "calib/step_q_gap": 0.1751984063037494, "calib/step_q_w": 0.5645926800472255, "calib/step_q_w_n": 847.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2646.0, "completions/max_terminated_length": 2646.0, "completions/mean_length": 568.9453125, "completions/mean_terminated_length": 577.9761962890625, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.09066666666666667, "grad_norm": 0.04487134516239166, "learning_rate": 3.1944444444444443e-06, "loss": -0.0152, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.030954696238040924, "mask/share_reasoning": 0.8301589488983154, "mask/share_step_conf": 0.12326133251190186, "num_tokens": 20709572.0, "reward": 1.1941413879394531, "reward_std": 0.30876463651657104, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5761609077453613, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7595765590667725, "step": 85 }, { "adv/mean_abs_final_conf": 0.3884989321231842, "adv/mean_abs_reasoning": 0.24427273869514465, "adv/mean_abs_step_conf": 0.7575057148933411, "adv/ratio_final_to_reasoning": 1.5904309838194248, "adv/ratio_step_to_reasoning": 3.1010653048710335, "adv/std_final_conf": 0.6616693139076233, "adv/std_reasoning": 0.5481932163238525, "adv/std_step_conf": 0.9352168440818787, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6341478696741853, "calib/avg_num_step_conf": 5.7734375, "calib/ece": 0.39470355731225304, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8853754940711462, "calib/gap": 0.12376441102756863, "calib/mean_conf": 0.9203952569169962, "calib/mu_c": 0.979097744360902, "calib/mu_w": 0.8553333333333334, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.39470355731225304, "calib/std_conf": 0.21292393633701728, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7278800000000001, "calib/step_q_c_n": 750.0, "calib/step_q_gap": 0.155640989010989, "calib/step_q_w": 0.5722390109890111, "calib/step_q_w_n": 728.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2666.0, "completions/max_terminated_length": 2666.0, "completions/mean_length": 534.9140625, "completions/mean_terminated_length": 537.0117797851562, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.09173333333333333, "grad_norm": 0.04348112642765045, "learning_rate": 3.1666666666666667e-06, "loss": 0.0043, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03243178874254227, "mask/share_reasoning": 0.8390860557556152, "mask/share_step_conf": 0.1245758980512619, "num_tokens": 20952022.0, "reward": 1.2344986200332642, "reward_std": 0.21900911629199982, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6001824140548706, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7840167880058289, "step": 86 }, { "adv/mean_abs_final_conf": 0.4687391519546509, "adv/mean_abs_reasoning": 0.41405802965164185, "adv/mean_abs_step_conf": 0.775758683681488, "adv/ratio_final_to_reasoning": 1.132061494735445, "adv/ratio_step_to_reasoning": 1.8735506333113614, "adv/std_final_conf": 0.7078585028648376, "adv/std_reasoning": 0.681672990322113, "adv/std_step_conf": 0.9350662231445312, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5451519093900666, "calib/avg_num_step_conf": 5.2265625, "calib/ece": 0.269484126984127, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9603174603174603, "calib/gap": 0.037394199127573136, "calib/mean_conf": 0.9661507936507937, "calib/mu_c": 0.9769832402234636, "calib/mu_w": 0.9395890410958905, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26265873015873015, "calib/std_conf": 0.1309302229930924, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7454516129032258, "calib/step_q_c_n": 930.0, "calib/step_q_gap": 0.06878494623655917, "calib/step_q_w": 0.6766666666666666, "calib/step_q_w_n": 408.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2914.0, "completions/max_terminated_length": 2914.0, "completions/mean_length": 489.05859375, "completions/mean_terminated_length": 489.05859375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.0928, "grad_norm": 0.06873677670955658, "learning_rate": 3.138888888888889e-06, "loss": 0.0182, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03524698317050934, "mask/share_reasoning": 0.8414642214775085, "mask/share_step_conf": 0.12328878790140152, "num_tokens": 21182717.0, "reward": 1.326061487197876, "reward_std": 0.2892857789993286, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.7155144810676575, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8003355264663696, "step": 87 }, { "adv/mean_abs_final_conf": 0.4815911054611206, "adv/mean_abs_reasoning": 0.32384055852890015, "adv/mean_abs_step_conf": 0.7832763195037842, "adv/ratio_final_to_reasoning": 1.487124119501364, "adv/ratio_step_to_reasoning": 2.4187097597099876, "adv/std_final_conf": 0.737276554107666, "adv/std_reasoning": 0.6186534762382507, "adv/std_step_conf": 0.9355406761169434, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6192062367115521, "calib/avg_num_step_conf": 5.83984375, "calib/ece": 0.28964143426294814, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9322709163346613, "calib/gap": 0.11223033309709418, "calib/mean_conf": 0.9392828685258965, "calib/mu_c": 0.9772891566265058, "calib/mu_w": 0.8650588235294117, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28378486055776886, "calib/std_conf": 0.19620372082373552, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6254573804573805, "calib/step_q_c_n": 962.0, "calib/step_q_gap": 0.09386263374068249, "calib/step_q_w": 0.531594746716698, "calib/step_q_w_n": 533.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2839.0, "completions/max_terminated_length": 2839.0, "completions/mean_length": 555.05078125, "completions/mean_terminated_length": 557.2274780273438, "completions/min_length": 0.0, "completions/min_terminated_length": 204.0, "epoch": 0.09386666666666667, "grad_norm": 0.06078055128455162, "learning_rate": 3.1111111111111116e-06, "loss": 0.0042, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.029890595003962517, "mask/share_reasoning": 0.8496705889701843, "mask/share_step_conf": 0.1165325790643692, "num_tokens": 21434658.0, "reward": 1.3150005340576172, "reward_std": 0.25842803716659546, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6966882944107056, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8037657737731934, "step": 88 }, { "adv/mean_abs_final_conf": 0.5294369459152222, "adv/mean_abs_reasoning": 0.3573133647441864, "adv/mean_abs_step_conf": 0.7569603323936462, "adv/ratio_final_to_reasoning": 1.4817160457859315, "adv/ratio_step_to_reasoning": 2.118477524442954, "adv/std_final_conf": 0.7610372304916382, "adv/std_reasoning": 0.6402999758720398, "adv/std_step_conf": 0.9327251315116882, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6629285077099528, "calib/avg_num_step_conf": 6.40625, "calib/ece": 0.34573705179282876, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8207171314741036, "calib/gap": 0.19132662163884306, "calib/mean_conf": 0.8756175298804783, "calib/mu_c": 0.9655639097744362, "calib/mu_w": 0.7742372881355931, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.34573705179282876, "calib/std_conf": 0.27125633106510616, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6052157829839705, "calib/step_q_c_n": 811.0, "calib/step_q_gap": 0.14473327393692587, "calib/step_q_w": 0.46048250904704463, "calib/step_q_w_n": 829.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2479.0, "completions/max_terminated_length": 2479.0, "completions/mean_length": 568.96484375, "completions/mean_terminated_length": 577.99609375, "completions/min_length": 0.0, "completions/min_terminated_length": 187.0, "epoch": 0.09493333333333333, "grad_norm": 0.04653610289096832, "learning_rate": 3.0833333333333336e-06, "loss": -0.0857, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.029766708612442017, "mask/share_reasoning": 0.8333158493041992, "mask/share_step_conf": 0.12129238992929459, "num_tokens": 21689201.0, "reward": 1.269324541091919, "reward_std": 0.27564674615859985, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6335089802742004, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8033512830734253, "step": 89 }, { "adv/mean_abs_final_conf": 0.4718265235424042, "adv/mean_abs_reasoning": 0.4269935190677643, "adv/mean_abs_step_conf": 0.7298752665519714, "adv/ratio_final_to_reasoning": 1.1049969202636185, "adv/ratio_step_to_reasoning": 1.7093357017349473, "adv/std_final_conf": 0.7413638830184937, "adv/std_reasoning": 0.7205032110214233, "adv/std_step_conf": 0.9347279667854309, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.616210241049779, "calib/avg_num_step_conf": 6.6328125, "calib/ece": 0.26588932806324106, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8616600790513834, "calib/gap": 0.15416773641420622, "calib/mean_conf": 0.888102766798419, "calib/mu_c": 0.9380701754385965, "calib/mu_w": 0.7839024390243903, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.23905138339920942, "calib/std_conf": 0.27309591857684906, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5750236518448438, "calib/step_q_c_n": 1057.0, "calib/step_q_gap": 0.13070227898992964, "calib/step_q_w": 0.4443213728549142, "calib/step_q_w_n": 641.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2454.0, "completions/max_terminated_length": 2454.0, "completions/mean_length": 536.16015625, "completions/mean_terminated_length": 540.3818969726562, "completions/min_length": 0.0, "completions/min_terminated_length": 194.0, "epoch": 0.096, "grad_norm": 0.053866349160671234, "learning_rate": 3.055555555555556e-06, "loss": -0.0463, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.030385639518499374, "mask/share_reasoning": 0.8322639465332031, "mask/share_step_conf": 0.12953796982765198, "num_tokens": 21929778.0, "reward": 1.369779109954834, "reward_std": 0.23515094816684723, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7164187431335449, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8463352918624878, "step": 90 }, { "adv/mean_abs_final_conf": 0.5278657674789429, "adv/mean_abs_reasoning": 0.41073352098464966, "adv/mean_abs_step_conf": 0.7537205219268799, "adv/ratio_final_to_reasoning": 1.2851782007309571, "adv/ratio_step_to_reasoning": 1.835059675967009, "adv/std_final_conf": 0.7727110385894775, "adv/std_reasoning": 0.7014096975326538, "adv/std_step_conf": 0.9352238178253174, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6680456171735242, "calib/avg_num_step_conf": 6.7734375, "calib/ece": 0.24536000000000005, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.836, "calib/gap": 0.20369409660107352, "calib/mean_conf": 0.87168, "calib/mu_c": 0.935232558139535, "calib/mu_w": 0.7315384615384615, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21452000000000007, "calib/std_conf": 0.29014544214927795, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5232943469785575, "calib/step_q_c_n": 1026.0, "calib/step_q_gap": 0.1492971718373145, "calib/step_q_w": 0.373997175141243, "calib/step_q_w_n": 708.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2402.0, "completions/max_terminated_length": 2402.0, "completions/mean_length": 541.12109375, "completions/mean_terminated_length": 547.53759765625, "completions/min_length": 0.0, "completions/min_terminated_length": 225.0, "epoch": 0.09706666666666666, "grad_norm": 0.07357340306043625, "learning_rate": 3.0277777777777776e-06, "loss": -0.0439, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.028301572427153587, "mask/share_reasoning": 0.8381121158599854, "mask/share_step_conf": 0.1218675896525383, "num_tokens": 22176017.0, "reward": 1.365419626235962, "reward_std": 0.2590673863887787, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.7370996475219727, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8324166536331177, "step": 91 }, { "adv/mean_abs_final_conf": 0.5962716341018677, "adv/mean_abs_reasoning": 0.4923080503940582, "adv/mean_abs_step_conf": 0.7713586091995239, "adv/ratio_final_to_reasoning": 1.211175875804984, "adv/ratio_step_to_reasoning": 1.5668210352889926, "adv/std_final_conf": 0.7960085272789001, "adv/std_reasoning": 0.7395338416099548, "adv/std_step_conf": 0.9347488880157471, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6264086232239099, "calib/avg_num_step_conf": 6.03515625, "calib/ece": 0.28157258064516133, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.7903225806451613, "calib/gap": 0.20725484706376407, "calib/mean_conf": 0.8207661290322581, "calib/mu_c": 0.8968152866242037, "calib/mu_w": 0.6895604395604397, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2346370967741936, "calib/std_conf": 0.3455614554575189, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5519620253164557, "calib/step_q_c_n": 948.0, "calib/step_q_gap": 0.14090674893454608, "calib/step_q_w": 0.4110552763819096, "calib/step_q_w_n": 597.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2518.0, "completions/max_terminated_length": 2518.0, "completions/mean_length": 513.265625, "completions/mean_terminated_length": 523.4900512695312, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.09813333333333334, "grad_norm": 0.053371768444776535, "learning_rate": 3e-06, "loss": -0.0806, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03073771856725216, "mask/share_reasoning": 0.8260927200317383, "mask/share_step_conf": 0.12363831698894501, "num_tokens": 22414133.0, "reward": 1.3340181112289429, "reward_std": 0.2972875237464905, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6785469055175781, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.837713360786438, "step": 92 }, { "adv/mean_abs_final_conf": 0.5813256502151489, "adv/mean_abs_reasoning": 0.4211743474006653, "adv/mean_abs_step_conf": 0.7620203495025635, "adv/ratio_final_to_reasoning": 1.3802494235531655, "adv/ratio_step_to_reasoning": 1.8092753136687352, "adv/std_final_conf": 0.7933601140975952, "adv/std_reasoning": 0.6817123889923096, "adv/std_step_conf": 0.9354146718978882, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6722418995146269, "calib/avg_num_step_conf": 7.109375, "calib/ece": 0.29296442687747026, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5849802371541502, "calib/gap": 0.3127922077922078, "calib/mean_conf": 0.6094861660079053, "calib/mu_c": 0.7318831168831169, "calib/mu_w": 0.4190909090909091, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14687747035573115, "calib/std_conf": 0.4599906907529452, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.46842459173871276, "calib/step_q_c_n": 1041.0, "calib/step_q_gap": 0.11452215271432242, "calib/step_q_w": 0.35390243902439034, "calib/step_q_w_n": 779.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2351.0, "completions/max_terminated_length": 2351.0, "completions/mean_length": 594.62109375, "completions/mean_terminated_length": 596.9530029296875, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.0992, "grad_norm": 0.07864338159561157, "learning_rate": 2.9722222222222225e-06, "loss": -0.0145, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.028944937512278557, "mask/share_reasoning": 0.8290570974349976, "mask/share_step_conf": 0.13809169828891754, "num_tokens": 22672132.0, "reward": 1.3323159217834473, "reward_std": 0.2134857177734375, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6910336017608643, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8278146386146545, "step": 93 }, { "adv/mean_abs_final_conf": 0.706995964050293, "adv/mean_abs_reasoning": 0.42493805289268494, "adv/mean_abs_step_conf": 0.7471293807029724, "adv/ratio_final_to_reasoning": 1.6637624219284493, "adv/ratio_step_to_reasoning": 1.7582077566766057, "adv/std_final_conf": 0.8768224716186523, "adv/std_reasoning": 0.7206136584281921, "adv/std_step_conf": 0.9347711801528931, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.68680105420614, "calib/avg_num_step_conf": 7.77734375, "calib/ece": 0.28625000000000006, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.5125, "calib/gap": 0.33865659947289695, "calib/mean_conf": 0.5400000000000001, "calib/mu_c": 0.6825179856115108, "calib/mu_w": 0.34386138613861383, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12354166666666673, "calib/std_conf": 0.4673025429704685, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.45993839835728956, "calib/step_q_c_n": 974.0, "calib/step_q_gap": 0.15902394408000342, "calib/step_q_w": 0.30091445427728614, "calib/step_q_w_n": 1017.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2980.0, "completions/max_terminated_length": 2980.0, "completions/mean_length": 567.58203125, "completions/mean_terminated_length": 585.89111328125, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.10026666666666667, "grad_norm": 0.0639803558588028, "learning_rate": 2.944444444444445e-06, "loss": -0.1048, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.02839193120598793, "mask/share_reasoning": 0.8084524869918823, "mask/share_step_conf": 0.13190564513206482, "num_tokens": 22926113.0, "reward": 1.272699236869812, "reward_std": 0.3097056448459625, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6576046943664551, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.7954593896865845, "step": 94 }, { "adv/mean_abs_final_conf": 0.6146141886711121, "adv/mean_abs_reasoning": 0.4706389009952545, "adv/mean_abs_step_conf": 0.7723755836486816, "adv/ratio_final_to_reasoning": 1.3059145501389595, "adv/ratio_step_to_reasoning": 1.6411214245472445, "adv/std_final_conf": 0.8057188987731934, "adv/std_reasoning": 0.7392786145210266, "adv/std_step_conf": 0.9354429841041565, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7381290584415585, "calib/avg_num_step_conf": 7.19140625, "calib/ece": 0.26336000000000004, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.576, "calib/gap": 0.3906114718614719, "calib/mean_conf": 0.5952000000000001, "calib/mu_c": 0.7451948051948052, "calib/mu_w": 0.3545833333333333, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.12128000000000003, "calib/std_conf": 0.46847855874095246, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4801834862385321, "calib/step_q_c_n": 1090.0, "calib/step_q_gap": 0.1356428737218876, "calib/step_q_w": 0.3445406125166445, "calib/step_q_w_n": 751.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2733.0, "completions/max_terminated_length": 2733.0, "completions/mean_length": 578.84765625, "completions/mean_terminated_length": 585.7114868164062, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.10133333333333333, "grad_norm": 0.06708774715662003, "learning_rate": 2.916666666666667e-06, "loss": -0.1013, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.028501030057668686, "mask/share_reasoning": 0.8217149972915649, "mask/share_step_conf": 0.13806524872779846, "num_tokens": 23180426.0, "reward": 1.346002459526062, "reward_std": 0.26025962829589844, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6996574401855469, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8395332098007202, "step": 95 }, { "adv/mean_abs_final_conf": 0.5579100251197815, "adv/mean_abs_reasoning": 0.30279800295829773, "adv/mean_abs_step_conf": 0.7457443475723267, "adv/ratio_final_to_reasoning": 1.8425155373188462, "adv/ratio_step_to_reasoning": 2.462844339416046, "adv/std_final_conf": 0.7927126884460449, "adv/std_reasoning": 0.5959977507591248, "adv/std_step_conf": 0.9352127909660339, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7185032894736842, "calib/avg_num_step_conf": 7.33203125, "calib/ece": 0.2919685039370078, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5787401574803149, "calib/gap": 0.33855921052631566, "calib/mean_conf": 0.6063779527559054, "calib/mu_c": 0.6916842105263157, "calib/mu_w": 0.353125, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07515748031496058, "calib/std_conf": 0.46109877209135486, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4926863468634687, "calib/step_q_c_n": 1355.0, "calib/step_q_gap": 0.15048328172936903, "calib/step_q_w": 0.34220306513409965, "calib/step_q_w_n": 522.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2968.0, "completions/max_terminated_length": 2968.0, "completions/mean_length": 533.7578125, "completions/mean_terminated_length": 535.8510131835938, "completions/min_length": 0.0, "completions/min_terminated_length": 216.0, "epoch": 0.1024, "grad_norm": 0.06763053685426712, "learning_rate": 2.888888888888889e-06, "loss": 0.0039, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.029817262664437294, "mask/share_reasoning": 0.8207234144210815, "mask/share_step_conf": 0.1455530822277069, "num_tokens": 23422884.0, "reward": 1.3436824083328247, "reward_std": 0.21160925924777985, "rewards/accuracy_reward_step": 0.7421875, "rewards/final_brier_reward_step": 0.7009460926055908, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8197718262672424, "step": 96 }, { "adv/mean_abs_final_conf": 0.6728768348693848, "adv/mean_abs_reasoning": 0.43920740485191345, "adv/mean_abs_step_conf": 0.7557384967803955, "adv/ratio_final_to_reasoning": 1.5320252514783013, "adv/ratio_step_to_reasoning": 1.7206870568022552, "adv/std_final_conf": 0.8519013524055481, "adv/std_reasoning": 0.7014325857162476, "adv/std_step_conf": 0.9350149035453796, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.682139522475657, "calib/avg_num_step_conf": 7.40625, "calib/ece": 0.30369477911646575, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5461847389558233, "calib/gap": 0.3056162464985993, "calib/mean_conf": 0.5771887550200803, "calib/mu_c": 0.7023809523809523, "calib/mu_w": 0.396764705882353, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1452610441767067, "calib/std_conf": 0.4634480173365173, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5188142292490118, "calib/step_q_c_n": 1012.0, "calib/step_q_gap": 0.14113323377389864, "calib/step_q_w": 0.37768099547511313, "calib/step_q_w_n": 884.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2542.0, "completions/max_terminated_length": 2542.0, "completions/mean_length": 551.26171875, "completions/mean_terminated_length": 555.6023559570312, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.10346666666666667, "grad_norm": 0.07822365313768387, "learning_rate": 2.861111111111111e-06, "loss": -0.0378, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.029504206031560898, "mask/share_reasoning": 0.8202545642852783, "mask/share_step_conf": 0.14242875576019287, "num_tokens": 23669079.0, "reward": 1.3232464790344238, "reward_std": 0.2652709484100342, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6721296906471252, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8324941992759705, "step": 97 }, { "adv/mean_abs_final_conf": 0.6271836161613464, "adv/mean_abs_reasoning": 0.420247346162796, "adv/mean_abs_step_conf": 0.7645853757858276, "adv/ratio_final_to_reasoning": 1.4924154117522663, "adv/ratio_step_to_reasoning": 1.8193699086195811, "adv/std_final_conf": 0.8403021097183228, "adv/std_reasoning": 0.7013960480690002, "adv/std_step_conf": 0.9347314834594727, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6450785612258148, "calib/avg_num_step_conf": 7.28125, "calib/ece": 0.3208695652173913, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5573122529644269, "calib/gap": 0.27845799246851055, "calib/mean_conf": 0.5872727272727272, "calib/mu_c": 0.6995364238410595, "calib/mu_w": 0.42107843137254897, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15565217391304345, "calib/std_conf": 0.46653807923338414, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5182462686567164, "calib/step_q_c_n": 1072.0, "calib/step_q_gap": 0.10553162219206996, "calib/step_q_w": 0.41271464646464645, "calib/step_q_w_n": 792.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2442.0, "completions/max_terminated_length": 2442.0, "completions/mean_length": 572.4453125, "completions/mean_terminated_length": 576.9527587890625, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.10453333333333334, "grad_norm": 0.07259117811918259, "learning_rate": 2.8333333333333335e-06, "loss": -0.0795, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.029141124337911606, "mask/share_reasoning": 0.8251111507415771, "mask/share_step_conf": 0.13793519139289856, "num_tokens": 23921809.0, "reward": 1.3164103031158447, "reward_std": 0.23620383441448212, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6614941358566284, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8286319375038147, "step": 98 }, { "adv/mean_abs_final_conf": 0.6719821691513062, "adv/mean_abs_reasoning": 0.5113678574562073, "adv/mean_abs_step_conf": 0.7439428567886353, "adv/ratio_final_to_reasoning": 1.314087616875399, "adv/ratio_step_to_reasoning": 1.4548095777653474, "adv/std_final_conf": 0.8572866320610046, "adv/std_reasoning": 0.7576091289520264, "adv/std_step_conf": 0.935447096824646, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6780647270349789, "calib/avg_num_step_conf": 8.0859375, "calib/ece": 0.2859677419354839, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.3709677419354839, "calib/gap": 0.33985616214449177, "calib/mean_conf": 0.4055645161290322, "calib/mu_c": 0.5878260869565218, "calib/mu_w": 0.24796992481203006, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11391129032258067, "calib/std_conf": 0.4630958279208193, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5245192307692308, "calib/step_q_c_n": 832.0, "calib/step_q_gap": 0.1616597800422518, "calib/step_q_w": 0.362859450726979, "calib/step_q_w_n": 1238.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2284.0, "completions/max_terminated_length": 2284.0, "completions/mean_length": 658.0078125, "completions/mean_terminated_length": 671.1155395507812, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.1056, "grad_norm": 0.06929034739732742, "learning_rate": 2.805555555555556e-06, "loss": -0.1218, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.025643540546298027, "mask/share_reasoning": 0.8259391784667969, "mask/share_step_conf": 0.12888604402542114, "num_tokens": 24196059.0, "reward": 1.3131260871887207, "reward_std": 0.26187536120414734, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.6805578470230103, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8310502767562866, "step": 99 }, { "adv/mean_abs_final_conf": 0.5885224342346191, "adv/mean_abs_reasoning": 0.3973303437232971, "adv/mean_abs_step_conf": 0.7406108975410461, "adv/ratio_final_to_reasoning": 1.4811917678365616, "adv/ratio_step_to_reasoning": 1.8639676260336446, "adv/std_final_conf": 0.8200313448905945, "adv/std_reasoning": 0.7013905644416809, "adv/std_step_conf": 0.9343773126602173, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7538900067069081, "calib/avg_num_step_conf": 7.73046875, "calib/ece": 0.2663967611336032, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5182186234817814, "calib/gap": 0.42807444668008054, "calib/mean_conf": 0.5378137651821862, "calib/mu_c": 0.7197887323943662, "calib/mu_w": 0.2917142857142857, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11465587044534414, "calib/std_conf": 0.4797149095727484, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5245932539682541, "calib/step_q_c_n": 1008.0, "calib/step_q_gap": 0.14648820762427872, "calib/step_q_w": 0.37810504634397535, "calib/step_q_w_n": 971.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2402.0, "completions/max_terminated_length": 2402.0, "completions/mean_length": 620.04296875, "completions/mean_terminated_length": 634.9240112304688, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.10666666666666667, "grad_norm": 0.061593785881996155, "learning_rate": 2.7777777777777783e-06, "loss": -0.1101, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.026141732931137085, "mask/share_reasoning": 0.8237094283103943, "mask/share_step_conf": 0.12671136856079102, "num_tokens": 24462198.0, "reward": 1.3486356735229492, "reward_std": 0.2595185339450836, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7075608968734741, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8425114154815674, "step": 100 }, { "adv/mean_abs_final_conf": 0.6654077172279358, "adv/mean_abs_reasoning": 0.5122069120407104, "adv/mean_abs_step_conf": 0.7619589567184448, "adv/ratio_final_to_reasoning": 1.299099449042673, "adv/ratio_step_to_reasoning": 1.4875999109084337, "adv/std_final_conf": 0.8433419466018677, "adv/std_reasoning": 0.7394994497299194, "adv/std_step_conf": 0.9344830513000488, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5954233409610984, "calib/avg_num_step_conf": 8.53125, "calib/ece": 0.3698387096774194, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.40725806451612906, "calib/gap": 0.21982085648904887, "calib/mean_conf": 0.43032258064516127, "calib/mu_c": 0.5322556390977444, "calib/mu_w": 0.31243478260869556, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.13193548387096776, "calib/std_conf": 0.4750431822202961, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5072187776793623, "calib/step_q_c_n": 1129.0, "calib/step_q_gap": 0.07490598147083144, "calib/step_q_w": 0.43231279620853086, "calib/step_q_w_n": 1055.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2630.0, "completions/max_terminated_length": 2630.0, "completions/mean_length": 677.53515625, "completions/mean_terminated_length": 682.8700561523438, "completions/min_length": 0.0, "completions/min_terminated_length": 200.0, "epoch": 0.10773333333333333, "grad_norm": 0.06848599761724472, "learning_rate": 2.7500000000000004e-06, "loss": -0.0228, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.024894852191209793, "mask/share_reasoning": 0.8335383534431458, "mask/share_step_conf": 0.13375428318977356, "num_tokens": 24742639.0, "reward": 1.250705361366272, "reward_std": 0.2775850296020508, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6042609214782715, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7997466921806335, "step": 101 }, { "adv/mean_abs_final_conf": 0.5570687055587769, "adv/mean_abs_reasoning": 0.36170411109924316, "adv/mean_abs_step_conf": 0.7299972772598267, "adv/ratio_final_to_reasoning": 1.5401226816742766, "adv/ratio_step_to_reasoning": 2.0182166993936446, "adv/std_final_conf": 0.7902530431747437, "adv/std_reasoning": 0.681429922580719, "adv/std_step_conf": 0.9347419738769531, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7456252095206168, "calib/avg_num_step_conf": 7.15234375, "calib/ece": 0.2704761904761904, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5079365079365079, "calib/gap": 0.4434837412001341, "calib/mean_conf": 0.5265079365079364, "calib/mu_c": 0.6936942675159236, "calib/mu_w": 0.25021052631578944, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.08698412698412697, "calib/std_conf": 0.4850132013590634, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5594901610017889, "calib/step_q_c_n": 1118.0, "calib/step_q_gap": 0.11871877250249024, "calib/step_q_w": 0.4407713884992987, "calib/step_q_w_n": 713.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2903.0, "completions/max_terminated_length": 2903.0, "completions/mean_length": 553.609375, "completions/mean_terminated_length": 555.7803955078125, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.1088, "grad_norm": 0.060042575001716614, "learning_rate": 2.7222222222222224e-06, "loss": -0.0549, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03207315504550934, "mask/share_reasoning": 0.8154199719429016, "mask/share_step_conf": 0.14860066771507263, "num_tokens": 24991059.0, "reward": 1.3538480997085571, "reward_std": 0.22371500730514526, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7175117135047913, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8353265523910522, "step": 102 }, { "adv/mean_abs_final_conf": 0.504610002040863, "adv/mean_abs_reasoning": 0.46583327651023865, "adv/mean_abs_step_conf": 0.7451016306877136, "adv/ratio_final_to_reasoning": 1.0832416392000113, "adv/ratio_step_to_reasoning": 1.5995028012373798, "adv/std_final_conf": 0.747572124004364, "adv/std_reasoning": 0.7575631141662598, "adv/std_step_conf": 0.9354161620140076, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.703758865248227, "calib/avg_num_step_conf": 7.3671875, "calib/ece": 0.2804098360655738, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.6270491803278688, "calib/gap": 0.36996028368794326, "calib/mean_conf": 0.6375409836065574, "calib/mu_c": 0.7800666666666666, "calib/mu_w": 0.4101063829787233, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.15159836065573773, "calib/std_conf": 0.4707289629044141, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5817972831765935, "calib/step_q_c_n": 957.0, "calib/step_q_gap": 0.12517726164806825, "calib/step_q_w": 0.45662002152852527, "calib/step_q_w_n": 929.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2726.0, "completions/max_terminated_length": 2726.0, "completions/mean_length": 634.44921875, "completions/mean_terminated_length": 647.087646484375, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.10986666666666667, "grad_norm": 0.05146903917193413, "learning_rate": 2.6944444444444444e-06, "loss": -0.1129, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.028226887807250023, "mask/share_reasoning": 0.8312579393386841, "mask/share_step_conf": 0.12098385393619537, "num_tokens": 25258030.0, "reward": 1.3042078018188477, "reward_std": 0.28497809171676636, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6827234029769897, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8089398145675659, "step": 103 }, { "adv/mean_abs_final_conf": 0.6268337965011597, "adv/mean_abs_reasoning": 0.4484860301017761, "adv/mean_abs_step_conf": 0.7349182367324829, "adv/ratio_final_to_reasoning": 1.39766626924569, "adv/ratio_step_to_reasoning": 1.638664723995318, "adv/std_final_conf": 0.8485493659973145, "adv/std_reasoning": 0.72063809633255, "adv/std_step_conf": 0.935498833656311, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7556918795851252, "calib/avg_num_step_conf": 7.4140625, "calib/ece": 0.22392857142857148, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.43253968253968256, "calib/gap": 0.4920882873766758, "calib/mean_conf": 0.4551984126984126, "calib/mu_c": 0.7168644067796609, "calib/mu_w": 0.2247761194029851, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.10543650793650798, "calib/std_conf": 0.4773425531139598, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5774069319640566, "calib/step_q_c_n": 779.0, "calib/step_q_gap": 0.13528896949756863, "calib/step_q_w": 0.44211796246648793, "calib/step_q_w_n": 1119.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2481.0, "completions/max_terminated_length": 2481.0, "completions/mean_length": 598.61328125, "completions/mean_terminated_length": 605.7114868164062, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.11093333333333333, "grad_norm": 0.08386190980672836, "learning_rate": 2.666666666666667e-06, "loss": -0.0847, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.02749706618487835, "mask/share_reasoning": 0.830479621887207, "mask/share_step_conf": 0.1303045153617859, "num_tokens": 25517955.0, "reward": 1.3406147956848145, "reward_std": 0.26186612248420715, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.7465121150016785, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8243899345397949, "step": 104 }, { "adv/mean_abs_final_conf": 0.7227871417999268, "adv/mean_abs_reasoning": 0.6233465671539307, "adv/mean_abs_step_conf": 0.7583866715431213, "adv/ratio_final_to_reasoning": 1.1595269467834255, "adv/ratio_step_to_reasoning": 1.2166372793320341, "adv/std_final_conf": 0.8908979892730713, "adv/std_reasoning": 0.842984139919281, "adv/std_step_conf": 0.9357901811599731, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5856815203145479, "calib/avg_num_step_conf": 6.73046875, "calib/ece": 0.4117269076305221, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.5100401606425703, "calib/gap": 0.14942857142857147, "calib/mean_conf": 0.5240160642570281, "calib/mu_c": 0.5894285714285714, "calib/mu_w": 0.43999999999999995, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.18674698795180725, "calib/std_conf": 0.4868940325560858, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.5554122621564482, "calib/step_q_c_n": 946.0, "calib/step_q_gap": 0.08958214632633232, "calib/step_q_w": 0.4658301158301159, "calib/step_q_w_n": 777.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3006.0, "completions/max_terminated_length": 3006.0, "completions/mean_length": 615.9921875, "completions/mean_terminated_length": 623.2964477539062, "completions/min_length": 0.0, "completions/min_terminated_length": 232.0, "epoch": 0.112, "grad_norm": 0.06290662288665771, "learning_rate": 2.6388888888888893e-06, "loss": -0.0662, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.027417458593845367, "mask/share_reasoning": 0.8350158333778381, "mask/share_step_conf": 0.12584799528121948, "num_tokens": 25781409.0, "reward": 1.20177161693573, "reward_std": 0.345975786447525, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.5574495792388916, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.7734373807907104, "step": 105 }, { "adv/mean_abs_final_conf": 0.5670008659362793, "adv/mean_abs_reasoning": 0.4488065838813782, "adv/mean_abs_step_conf": 0.768217921257019, "adv/ratio_final_to_reasoning": 1.263352380067001, "adv/ratio_step_to_reasoning": 1.711690400379828, "adv/std_final_conf": 0.776646077632904, "adv/std_reasoning": 0.7206236720085144, "adv/std_step_conf": 0.9351850748062134, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.766190103146625, "calib/avg_num_step_conf": 7.46484375, "calib/ece": 0.23493975903614456, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5100401606425703, "calib/gap": 0.4839992166079122, "calib/mean_conf": 0.5307630522088354, "calib/mu_c": 0.7465217391304347, "calib/mu_w": 0.26252252252252256, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.10574297188755019, "calib/std_conf": 0.48095435171547984, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6151315789473685, "calib/step_q_c_n": 988.0, "calib/step_q_gap": 0.19577079888236304, "calib/step_q_w": 0.41936078006500543, "calib/step_q_w_n": 923.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2453.0, "completions/max_terminated_length": 2453.0, "completions/mean_length": 600.07421875, "completions/mean_terminated_length": 607.1897583007812, "completions/min_length": 0.0, "completions/min_terminated_length": 259.0, "epoch": 0.11306666666666666, "grad_norm": 0.0558047778904438, "learning_rate": 2.6111111111111113e-06, "loss": -0.0252, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.02651432901620865, "mask/share_reasoning": 0.8316283226013184, "mask/share_step_conf": 0.13013863563537598, "num_tokens": 26039612.0, "reward": 1.343740701675415, "reward_std": 0.2567639648914337, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7355890274047852, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8255555629730225, "step": 106 }, { "adv/mean_abs_final_conf": 0.5723678469657898, "adv/mean_abs_reasoning": 0.4326947331428528, "adv/mean_abs_step_conf": 0.7519802451133728, "adv/ratio_final_to_reasoning": 1.3227982758385561, "adv/ratio_step_to_reasoning": 1.7379001580430815, "adv/std_final_conf": 0.8108200430870056, "adv/std_reasoning": 0.7205343246459961, "adv/std_step_conf": 0.9351160526275635, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6481891610213653, "calib/avg_num_step_conf": 7.31640625, "calib/ece": 0.30758893280632416, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6679841897233202, "calib/gap": 0.28726810838978634, "calib/mean_conf": 0.6830830039525692, "calib/mu_c": 0.7977631578947367, "calib/mu_w": 0.5104950495049504, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19494071146245062, "calib/std_conf": 0.4509594306920598, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5780399619410086, "calib/step_q_c_n": 1051.0, "calib/step_q_gap": 0.07975529040816182, "calib/step_q_w": 0.4982846715328468, "calib/step_q_w_n": 822.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2507.0, "completions/max_terminated_length": 2507.0, "completions/mean_length": 547.1875, "completions/mean_terminated_length": 549.3333740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 159.0, "epoch": 0.11413333333333334, "grad_norm": 0.04952119290828705, "learning_rate": 2.5833333333333337e-06, "loss": 0.0278, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.029747068881988525, "mask/share_reasoning": 0.8243297338485718, "mask/share_step_conf": 0.1420169472694397, "num_tokens": 26284308.0, "reward": 1.3272364139556885, "reward_std": 0.2504537105560303, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6797593832015991, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8291535377502441, "step": 107 }, { "adv/mean_abs_final_conf": 0.548678457736969, "adv/mean_abs_reasoning": 0.40821319818496704, "adv/mean_abs_step_conf": 0.7383649349212646, "adv/ratio_final_to_reasoning": 1.3440977905088585, "adv/ratio_step_to_reasoning": 1.8087728133344216, "adv/std_final_conf": 0.792629063129425, "adv/std_reasoning": 0.7014238834381104, "adv/std_step_conf": 0.9351604580879211, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7042369395310573, "calib/avg_num_step_conf": 7.74609375, "calib/ece": 0.2576583333333333, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6547619047619048, "calib/gap": 0.3955978033730976, "calib/mean_conf": 0.6669448412698411, "calib/mu_c": 0.7689839572192514, "calib/mu_w": 0.3733861538461538, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.09126984126984128, "calib/std_conf": 0.4603743121794018, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6037661351556569, "calib/step_q_c_n": 1317.0, "calib/step_q_gap": 0.11863100002052174, "calib/step_q_w": 0.4851351351351352, "calib/step_q_w_n": 666.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2240.0, "completions/max_terminated_length": 2240.0, "completions/mean_length": 591.82421875, "completions/mean_terminated_length": 596.4842529296875, "completions/min_length": 0.0, "completions/min_terminated_length": 199.0, "epoch": 0.1152, "grad_norm": 0.04725135862827301, "learning_rate": 2.5555555555555557e-06, "loss": -0.04, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.028656262904405594, "mask/share_reasoning": 0.8208492994308472, "mask/share_step_conf": 0.14268191158771515, "num_tokens": 26539047.0, "reward": 1.3706966638565063, "reward_std": 0.2445434033870697, "rewards/accuracy_reward_step": 0.73046875, "rewards/final_brier_reward_step": 0.7252894043922424, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8373488187789917, "step": 108 }, { "adv/mean_abs_final_conf": 0.45445898175239563, "adv/mean_abs_reasoning": 0.32057422399520874, "adv/mean_abs_step_conf": 0.750169038772583, "adv/ratio_final_to_reasoning": 1.417640433122246, "adv/ratio_step_to_reasoning": 2.3400790912740224, "adv/std_final_conf": 0.7065565586090088, "adv/std_reasoning": 0.6186202168464661, "adv/std_step_conf": 0.9343803524971008, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7798996655518395, "calib/avg_num_step_conf": 9.0859375, "calib/ece": 0.22836734693877553, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.5061224489795918, "calib/gap": 0.501391304347826, "calib/mean_conf": 0.5226530612244898, "calib/mu_c": 0.7579999999999999, "calib/mu_w": 0.25660869565217387, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11020408163265306, "calib/std_conf": 0.4844225491903719, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5867817307692307, "calib/step_q_c_n": 1040.0, "calib/step_q_gap": 0.20645824709893518, "calib/step_q_w": 0.3803234836702955, "calib/step_q_w_n": 1286.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3071.0, "completions/max_terminated_length": 3071.0, "completions/mean_length": 640.05078125, "completions/mean_terminated_length": 650.2103271484375, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.11626666666666667, "grad_norm": 0.04770355299115181, "learning_rate": 2.5277777777777778e-06, "loss": 0.019, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.02566307783126831, "mask/share_reasoning": 0.8131533861160278, "mask/share_step_conf": 0.14555855095386505, "num_tokens": 26807500.0, "reward": 1.3314653635025024, "reward_std": 0.21549201011657715, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7329742312431335, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.818884551525116, "step": 109 }, { "adv/mean_abs_final_conf": 0.5815128087997437, "adv/mean_abs_reasoning": 0.4468216001987457, "adv/mean_abs_step_conf": 0.7596118450164795, "adv/ratio_final_to_reasoning": 1.3014429216069399, "adv/ratio_step_to_reasoning": 1.700033849479535, "adv/std_final_conf": 0.8087716698646545, "adv/std_reasoning": 0.7014685273170471, "adv/std_step_conf": 0.9353334307670593, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6179768203577727, "calib/avg_num_step_conf": 7.109375, "calib/ece": 0.38184313725490193, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5098039215686274, "calib/gap": 0.1932275132275133, "calib/mean_conf": 0.5376862745098039, "calib/mu_c": 0.6195238095238095, "calib/mu_w": 0.4262962962962962, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1715294117647059, "calib/std_conf": 0.47674276741467103, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6170449897750512, "calib/step_q_c_n": 978.0, "calib/step_q_gap": 0.10406399215034812, "calib/step_q_w": 0.5129809976247031, "calib/step_q_w_n": 842.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1489.0, "completions/max_terminated_length": 1489.0, "completions/mean_length": 542.19921875, "completions/mean_terminated_length": 544.3255004882812, "completions/min_length": 0.0, "completions/min_terminated_length": 149.0, "epoch": 0.11733333333333333, "grad_norm": 0.04419422522187233, "learning_rate": 2.5e-06, "loss": -0.0314, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.030844062566757202, "mask/share_reasoning": 0.8253611922264099, "mask/share_step_conf": 0.1398884356021881, "num_tokens": 27051223.0, "reward": 1.2631927728652954, "reward_std": 0.21499332785606384, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6189863085746765, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7970589399337769, "step": 110 }, { "adv/mean_abs_final_conf": 0.5841657519340515, "adv/mean_abs_reasoning": 0.43651944398880005, "adv/mean_abs_step_conf": 0.7689560651779175, "adv/ratio_final_to_reasoning": 1.3382353523501687, "adv/ratio_step_to_reasoning": 1.7615620008845398, "adv/std_final_conf": 0.7941992282867432, "adv/std_reasoning": 0.7013795375823975, "adv/std_step_conf": 0.9354787468910217, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6561019782669266, "calib/avg_num_step_conf": 7.54296875, "calib/ece": 0.33289795918367343, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.6244897959183674, "calib/gap": 0.2811744218445249, "calib/mean_conf": 0.6323265306122449, "calib/mu_c": 0.7436486486486485, "calib/mu_w": 0.46247422680412364, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18057142857142855, "calib/std_conf": 0.4751634471853076, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5799950347567031, "calib/step_q_c_n": 1007.0, "calib/step_q_gap": 0.18059027285194124, "calib/step_q_w": 0.3994047619047619, "calib/step_q_w_n": 924.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3038.0, "completions/max_terminated_length": 3038.0, "completions/mean_length": 632.640625, "completions/mean_terminated_length": 635.12158203125, "completions/min_length": 0.0, "completions/min_terminated_length": 179.0, "epoch": 0.1184, "grad_norm": 0.05241648480296135, "learning_rate": 2.4722222222222226e-06, "loss": 0.0322, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.029879005625844002, "mask/share_reasoning": 0.8296252489089966, "mask/share_step_conf": 0.13658946752548218, "num_tokens": 27320587.0, "reward": 1.2603142261505127, "reward_std": 0.28884023427963257, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6361085772514343, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7887442708015442, "step": 111 }, { "adv/mean_abs_final_conf": 0.5549716353416443, "adv/mean_abs_reasoning": 0.4541664123535156, "adv/mean_abs_step_conf": 0.7850562334060669, "adv/ratio_final_to_reasoning": 1.2219565785716087, "adv/ratio_step_to_reasoning": 1.7285651515660565, "adv/std_final_conf": 0.7948229312896729, "adv/std_reasoning": 0.7392616271972656, "adv/std_step_conf": 0.9357273578643799, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7439711757990867, "calib/avg_num_step_conf": 7.51953125, "calib/ece": 0.28107438016528924, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.4462809917355372, "calib/gap": 0.45555793378995435, "calib/mean_conf": 0.4566115702479338, "calib/mu_c": 0.6373287671232877, "calib/mu_w": 0.18177083333333333, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.0671900826446281, "calib/std_conf": 0.48851952720558744, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5791213270142179, "calib/step_q_c_n": 1055.0, "calib/step_q_gap": 0.19313282126709147, "calib/step_q_w": 0.38598850574712645, "calib/step_q_w_n": 870.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3037.0, "completions/max_terminated_length": 3037.0, "completions/mean_length": 631.64453125, "completions/mean_terminated_length": 644.2271118164062, "completions/min_length": 0.0, "completions/min_terminated_length": 239.0, "epoch": 0.11946666666666667, "grad_norm": 0.11762329190969467, "learning_rate": 2.4444444444444447e-06, "loss": -0.0457, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.02612355723977089, "mask/share_reasoning": 0.8241636753082275, "mask/share_step_conf": 0.13018152117729187, "num_tokens": 27590208.0, "reward": 1.282677412033081, "reward_std": 0.27923113107681274, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6753547191619873, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.7934376001358032, "step": 112 }, { "adv/mean_abs_final_conf": 0.5706948041915894, "adv/mean_abs_reasoning": 0.32284876704216003, "adv/mean_abs_step_conf": 0.7501890063285828, "adv/ratio_final_to_reasoning": 1.7676846327155515, "adv/ratio_step_to_reasoning": 2.32365454947089, "adv/std_final_conf": 0.7929322123527527, "adv/std_reasoning": 0.5961199998855591, "adv/std_step_conf": 0.9351218938827515, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7311046511627907, "calib/avg_num_step_conf": 7.78125, "calib/ece": 0.2763779527559055, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5196850393700787, "calib/gap": 0.4366860465116278, "calib/mean_conf": 0.5346456692913386, "calib/mu_c": 0.6824999999999999, "calib/mu_w": 0.2458139534883721, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0748031496062992, "calib/std_conf": 0.48556614456627994, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6204979591836733, "calib/step_q_c_n": 1225.0, "calib/step_q_gap": 0.1989334220259158, "calib/step_q_w": 0.42156453715775755, "calib/step_q_w_n": 767.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2499.0, "completions/max_terminated_length": 2499.0, "completions/mean_length": 558.078125, "completions/mean_terminated_length": 560.2667236328125, "completions/min_length": 0.0, "completions/min_terminated_length": 222.0, "epoch": 0.12053333333333334, "grad_norm": 0.057893529534339905, "learning_rate": 2.4166666666666667e-06, "loss": -0.0251, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.0291130430996418, "mask/share_reasoning": 0.8191037178039551, "mask/share_step_conf": 0.14787697792053223, "num_tokens": 27838276.0, "reward": 1.386540174484253, "reward_std": 0.22876328229904175, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.710267186164856, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8669533729553223, "step": 113 }, { "adv/mean_abs_final_conf": 0.4646427035331726, "adv/mean_abs_reasoning": 0.3929080367088318, "adv/mean_abs_step_conf": 0.7613837718963623, "adv/ratio_final_to_reasoning": 1.182573681681906, "adv/ratio_step_to_reasoning": 1.9378167427524342, "adv/std_final_conf": 0.7412422895431519, "adv/std_reasoning": 0.6816239953041077, "adv/std_step_conf": 0.9348096251487732, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.798599012775842, "calib/avg_num_step_conf": 7.63671875, "calib/ece": 0.16599959999999997, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.696, "calib/gap": 0.5592514663182346, "calib/mean_conf": 0.7072803999999999, "calib/mu_c": 0.8907148809523809, "calib/mu_w": 0.33146341463414636, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.10064, "calib/std_conf": 0.44446557101291884, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.6734562607204116, "calib/step_q_c_n": 1166.0, "calib/step_q_gap": 0.24828515805881463, "calib/step_q_w": 0.425171102661597, "calib/step_q_w_n": 789.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2005.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 554.14453125, "completions/mean_terminated_length": 567.4440307617188, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.1216, "grad_norm": 0.04622293636202812, "learning_rate": 2.388888888888889e-06, "loss": -0.1354, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.030705967918038368, "mask/share_reasoning": 0.8009713888168335, "mask/share_step_conf": 0.14488518238067627, "num_tokens": 28085161.0, "reward": 1.4095648527145386, "reward_std": 0.2490803748369217, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.8027671575546265, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8460718989372253, "step": 114 }, { "adv/mean_abs_final_conf": 0.48520588874816895, "adv/mean_abs_reasoning": 0.36930257081985474, "adv/mean_abs_step_conf": 0.802715539932251, "adv/ratio_final_to_reasoning": 1.3138437884984333, "adv/ratio_step_to_reasoning": 2.1735985702731933, "adv/std_final_conf": 0.7171012163162231, "adv/std_reasoning": 0.6187039017677307, "adv/std_step_conf": 0.935295581817627, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6699300699300699, "calib/avg_num_step_conf": 7.79296875, "calib/ece": 0.2988134387351779, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7351778656126482, "calib/gap": 0.33630048951048963, "calib/mean_conf": 0.743083794466403, "calib/mu_c": 0.8893013986013987, "calib/mu_w": 0.553000909090909, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23833992094861664, "calib/std_conf": 0.4289959877253122, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.651243830207305, "calib/step_q_c_n": 1013.0, "calib/step_q_gap": 0.1571603271523151, "calib/step_q_w": 0.49408350305498994, "calib/step_q_w_n": 982.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1356.0, "completions/max_terminated_length": 1356.0, "completions/mean_length": 528.60546875, "completions/mean_terminated_length": 534.87353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 204.0, "epoch": 0.12266666666666666, "grad_norm": 0.03641931712627411, "learning_rate": 2.361111111111111e-06, "loss": -0.0446, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.029507558792829514, "mask/share_reasoning": 0.8150744438171387, "mask/share_step_conf": 0.14369924366474152, "num_tokens": 28325748.0, "reward": 1.2941392660140991, "reward_std": 0.24585412442684174, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6956202983856201, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7916415929794312, "step": 115 }, { "adv/mean_abs_final_conf": 0.543901264667511, "adv/mean_abs_reasoning": 0.38514238595962524, "adv/mean_abs_step_conf": 0.7905224561691284, "adv/ratio_final_to_reasoning": 1.4122082754208427, "adv/ratio_step_to_reasoning": 2.0525459803637385, "adv/std_final_conf": 0.7767652869224548, "adv/std_reasoning": 0.6614387631416321, "adv/std_step_conf": 0.9356350898742676, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6349728629579376, "calib/avg_num_step_conf": 8.4765625, "calib/ece": 0.33713114754098356, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.7704918032786885, "calib/gap": 0.25052645861601086, "calib/mean_conf": 0.781311475409836, "calib/mu_c": 0.8942537313432836, "calib/mu_w": 0.6437272727272727, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.28463114754098356, "calib/std_conf": 0.40257969543904026, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6374273305084746, "calib/step_q_c_n": 944.0, "calib/step_q_gap": 0.1480227628086377, "calib/step_q_w": 0.48940456769983687, "calib/step_q_w_n": 1226.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2961.0, "completions/max_terminated_length": 2961.0, "completions/mean_length": 607.95703125, "completions/mean_terminated_length": 622.5480346679688, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.12373333333333333, "grad_norm": 0.030581878498196602, "learning_rate": 2.3333333333333336e-06, "loss": -0.0749, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.02716742642223835, "mask/share_reasoning": 0.8129376173019409, "mask/share_step_conf": 0.1364574283361435, "num_tokens": 28585905.0, "reward": 1.233880877494812, "reward_std": 0.3017020523548126, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6295531392097473, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.7722293138504028, "step": 116 }, { "adv/mean_abs_final_conf": 0.5185819864273071, "adv/mean_abs_reasoning": 0.3688926100730896, "adv/mean_abs_step_conf": 0.7403764724731445, "adv/ratio_final_to_reasoning": 1.4057803606436008, "adv/ratio_step_to_reasoning": 2.007024408340552, "adv/std_final_conf": 0.7588999271392822, "adv/std_reasoning": 0.6612929105758667, "adv/std_step_conf": 0.9353594779968262, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6061151079136691, "calib/avg_num_step_conf": 7.84765625, "calib/ece": 0.361501581027668, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7905138339920948, "calib/gap": 0.19482906727249794, "calib/mean_conf": 0.803794861660079, "calib/mu_c": 0.8915834532374102, "calib/mu_w": 0.6967543859649122, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3079446640316206, "calib/std_conf": 0.38466862985657124, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6479385530227949, "calib/step_q_c_n": 1009.0, "calib/step_q_gap": 0.12708855302279487, "calib/step_q_w": 0.52085, "calib/step_q_w_n": 1000.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2040.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 579.16015625, "completions/mean_terminated_length": 581.431396484375, "completions/min_length": 0.0, "completions/min_terminated_length": 216.0, "epoch": 0.1248, "grad_norm": 0.030159028246998787, "learning_rate": 2.305555555555556e-06, "loss": -0.0532, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02752252295613289, "mask/share_reasoning": 0.8242526650428772, "mask/share_step_conf": 0.14431855082511902, "num_tokens": 28840770.0, "reward": 1.268562912940979, "reward_std": 0.274804949760437, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6287655830383301, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.801445722579956, "step": 117 }, { "adv/mean_abs_final_conf": 0.43278244137763977, "adv/mean_abs_reasoning": 0.2965746819972992, "adv/mean_abs_step_conf": 0.7554448246955872, "adv/ratio_final_to_reasoning": 1.459269680281006, "adv/ratio_step_to_reasoning": 2.547233026123473, "adv/std_final_conf": 0.7201955318450928, "adv/std_reasoning": 0.5961441397666931, "adv/std_step_conf": 0.9351212382316589, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5915766596417281, "calib/avg_num_step_conf": 8.55859375, "calib/ece": 0.3528, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.868, "calib/gap": 0.15894625922023187, "calib/mean_conf": 0.88744, "calib/mu_c": 0.9535616438356165, "calib/mu_w": 0.7946153846153846, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.32812, "calib/std_conf": 0.303349050435303, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6735756056808688, "calib/step_q_c_n": 1197.0, "calib/step_q_gap": 0.1388673561838869, "calib/step_q_w": 0.5347082494969819, "calib/step_q_w_n": 994.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2845.0, "completions/max_terminated_length": 2845.0, "completions/mean_length": 653.11328125, "completions/mean_terminated_length": 653.11328125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.12586666666666665, "grad_norm": 0.05803324282169342, "learning_rate": 2.277777777777778e-06, "loss": -0.0084, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.02645675651729107, "mask/share_reasoning": 0.828969419002533, "mask/share_step_conf": 0.14457379281520844, "num_tokens": 29111975.0, "reward": 1.26542329788208, "reward_std": 0.23771917819976807, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6349507570266724, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.793260395526886, "step": 118 }, { "adv/mean_abs_final_conf": 0.5820865631103516, "adv/mean_abs_reasoning": 0.4483203887939453, "adv/mean_abs_step_conf": 0.789027214050293, "adv/ratio_final_to_reasoning": 1.2983718288527073, "adv/ratio_step_to_reasoning": 1.7599628162638428, "adv/std_final_conf": 0.7938616275787354, "adv/std_reasoning": 0.7014293074607849, "adv/std_step_conf": 0.9352133274078369, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6387527233115468, "calib/avg_num_step_conf": 8.3984375, "calib/ece": 0.3209756097560975, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7357723577235772, "calib/gap": 0.250061274509804, "calib/mean_conf": 0.755691056910569, "calib/mu_c": 0.859375, "calib/mu_w": 0.609313725490196, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.245650406504065, "calib/std_conf": 0.41067083984684005, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6519414893617023, "calib/step_q_c_n": 1128.0, "calib/step_q_gap": 0.16325254611316997, "calib/step_q_w": 0.4886889432485323, "calib/step_q_w_n": 1022.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2806.0, "completions/max_terminated_length": 2806.0, "completions/mean_length": 629.90625, "completions/mean_terminated_length": 645.0240478515625, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.12693333333333334, "grad_norm": 0.03724614530801773, "learning_rate": 2.25e-06, "loss": -0.1336, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.02622862160205841, "mask/share_reasoning": 0.8168337345123291, "mask/share_step_conf": 0.13350018858909607, "num_tokens": 29378295.0, "reward": 1.276491403579712, "reward_std": 0.3191832900047302, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6544101238250732, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7969425916671753, "step": 119 }, { "adv/mean_abs_final_conf": 0.5171021223068237, "adv/mean_abs_reasoning": 0.42247337102890015, "adv/mean_abs_step_conf": 0.7612936496734619, "adv/ratio_final_to_reasoning": 1.2239874930991812, "adv/ratio_step_to_reasoning": 1.80199203518885, "adv/std_final_conf": 0.7414100170135498, "adv/std_reasoning": 0.7014121413230896, "adv/std_step_conf": 0.9345976710319519, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6686746987951807, "calib/avg_num_step_conf": 7.640625, "calib/ece": 0.24758064516129036, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.7862903225806451, "calib/gap": 0.3137685286600951, "calib/mean_conf": 0.8005645161290323, "calib/mu_c": 0.9055757575757577, "calib/mu_w": 0.5918072289156626, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.19141129032258067, "calib/std_conf": 0.3854789698028282, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6643690165361184, "calib/step_q_c_n": 1149.0, "calib/step_q_gap": 0.18397248617676265, "calib/step_q_w": 0.4803965303593557, "calib/step_q_w_n": 807.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2875.0, "completions/max_terminated_length": 2875.0, "completions/mean_length": 547.33984375, "completions/mean_terminated_length": 558.2430419921875, "completions/min_length": 0.0, "completions/min_terminated_length": 238.0, "epoch": 0.128, "grad_norm": 0.03176447004079819, "learning_rate": 2.222222222222222e-06, "loss": -0.0537, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.028437182307243347, "mask/share_reasoning": 0.8139419555664062, "mask/share_step_conf": 0.13808965682983398, "num_tokens": 29625102.0, "reward": 1.3524608612060547, "reward_std": 0.2827880382537842, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7267366647720337, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8277642726898193, "step": 120 }, { "adv/mean_abs_final_conf": 0.641068160533905, "adv/mean_abs_reasoning": 0.4862363636493683, "adv/mean_abs_step_conf": 0.7601180076599121, "adv/ratio_final_to_reasoning": 1.3184290778305263, "adv/ratio_step_to_reasoning": 1.5632685345764137, "adv/std_final_conf": 0.858704686164856, "adv/std_reasoning": 0.7575821876525879, "adv/std_step_conf": 0.9351223111152649, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6479188166494668, "calib/avg_num_step_conf": 8.3203125, "calib/ece": 0.2900403225806451, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.75, "calib/gap": 0.2653340213278297, "calib/mean_conf": 0.7735887096774193, "calib/mu_c": 0.8752287581699348, "calib/mu_w": 0.6098947368421052, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22334677419354837, "calib/std_conf": 0.39741049940058065, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6422859744990893, "calib/step_q_c_n": 1098.0, "calib/step_q_gap": 0.15888481170839164, "calib/step_q_w": 0.48340116279069767, "calib/step_q_w_n": 1032.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2712.0, "completions/max_terminated_length": 2712.0, "completions/mean_length": 626.875, "completions/mean_terminated_length": 639.362548828125, "completions/min_length": 0.0, "completions/min_terminated_length": 219.0, "epoch": 0.12906666666666666, "grad_norm": 0.030280577018857002, "learning_rate": 2.1944444444444445e-06, "loss": -0.0602, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.026005305349826813, "mask/share_reasoning": 0.8234038352966309, "mask/share_step_conf": 0.13105961680412292, "num_tokens": 29890638.0, "reward": 1.3218499422073364, "reward_std": 0.308574378490448, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6845276951789856, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8225547671318054, "step": 121 }, { "adv/mean_abs_final_conf": 0.5707475543022156, "adv/mean_abs_reasoning": 0.43730437755584717, "adv/mean_abs_step_conf": 0.7497875094413757, "adv/ratio_final_to_reasoning": 1.3051494190206836, "adv/ratio_step_to_reasoning": 1.714566667802501, "adv/std_final_conf": 0.7942202687263489, "adv/std_reasoning": 0.720486581325531, "adv/std_step_conf": 0.9346208572387695, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7199184463335407, "calib/avg_num_step_conf": 8.484375, "calib/ece": 0.2172, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.744, "calib/gap": 0.40112101734743244, "calib/mean_conf": 0.7680799999999999, "calib/mu_c": 0.9140880503144654, "calib/mu_w": 0.5129670329670329, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17464000000000002, "calib/std_conf": 0.4014527538826954, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6792504258943782, "calib/step_q_c_n": 1174.0, "calib/step_q_gap": 0.24258709924107158, "calib/step_q_w": 0.43666332665330665, "calib/step_q_w_n": 998.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2677.0, "completions/max_terminated_length": 2677.0, "completions/mean_length": 617.8203125, "completions/mean_terminated_length": 622.68505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 244.0, "epoch": 0.13013333333333332, "grad_norm": 0.03379069268703461, "learning_rate": 2.166666666666667e-06, "loss": 0.0004, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.026336126029491425, "mask/share_reasoning": 0.8244964480400085, "mask/share_step_conf": 0.14135491847991943, "num_tokens": 30156144.0, "reward": 1.3796439170837402, "reward_std": 0.3015453815460205, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7574304342269897, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8411630392074585, "step": 122 }, { "adv/mean_abs_final_conf": 0.6780246496200562, "adv/mean_abs_reasoning": 0.49683240056037903, "adv/mean_abs_step_conf": 0.7450475692749023, "adv/ratio_final_to_reasoning": 1.364694912922969, "adv/ratio_step_to_reasoning": 1.4995953734791865, "adv/std_final_conf": 0.8590086698532104, "adv/std_reasoning": 0.7394485473632812, "adv/std_step_conf": 0.9357582330703735, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6953206465067779, "calib/avg_num_step_conf": 8.56640625, "calib/ece": 0.27651405622489966, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5742971887550201, "calib/gap": 0.3350961939520333, "calib/mean_conf": 0.6227630522088354, "calib/mu_c": 0.7734890510948904, "calib/mu_w": 0.4383928571428571, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17453815261044187, "calib/std_conf": 0.4485328292063186, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6023872180451129, "calib/step_q_c_n": 1064.0, "calib/step_q_gap": 0.15375125701765496, "calib/step_q_w": 0.4486359610274579, "calib/step_q_w_n": 1129.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2769.0, "completions/max_terminated_length": 2769.0, "completions/mean_length": 726.8359375, "completions/mean_terminated_length": 726.8359375, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.1312, "grad_norm": 0.035880621522665024, "learning_rate": 2.138888888888889e-06, "loss": 0.0256, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.023776985704898834, "mask/share_reasoning": 0.8434326648712158, "mask/share_step_conf": 0.13279032707214355, "num_tokens": 30447502.0, "reward": 1.3135013580322266, "reward_std": 0.29589012265205383, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6924653649330139, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8164873719215393, "step": 123 }, { "adv/mean_abs_final_conf": 0.5248005390167236, "adv/mean_abs_reasoning": 0.33113977313041687, "adv/mean_abs_step_conf": 0.7580268383026123, "adv/ratio_final_to_reasoning": 1.5848308829094806, "adv/ratio_step_to_reasoning": 2.2891446446817163, "adv/std_final_conf": 0.7578999400138855, "adv/std_reasoning": 0.6185828447341919, "adv/std_step_conf": 0.935204803943634, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6887034659820281, "calib/avg_num_step_conf": 8.40234375, "calib/ece": 0.29213438735177866, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5810276679841897, "calib/gap": 0.3372108115818001, "calib/mean_conf": 0.6046245059288538, "calib/mu_c": 0.7139181286549708, "calib/mu_w": 0.37670731707317073, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11043478260869564, "calib/std_conf": 0.4690166709833766, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5899855177407676, "calib/step_q_c_n": 1381.0, "calib/step_q_gap": 0.18094655670180648, "calib/step_q_w": 0.4090389610389611, "calib/step_q_w_n": 770.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2552.0, "completions/max_terminated_length": 2552.0, "completions/mean_length": 630.87890625, "completions/mean_terminated_length": 633.3529663085938, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.13226666666666667, "grad_norm": 0.03069070540368557, "learning_rate": 2.1111111111111114e-06, "loss": -0.0143, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.026175033301115036, "mask/share_reasoning": 0.8293299674987793, "mask/share_step_conf": 0.14058873057365417, "num_tokens": 30715823.0, "reward": 1.3607532978057861, "reward_std": 0.23503340780735016, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.6918461322784424, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8495957851409912, "step": 124 }, { "adv/mean_abs_final_conf": 0.6436437368392944, "adv/mean_abs_reasoning": 0.5224178433418274, "adv/mean_abs_step_conf": 0.7601853013038635, "adv/ratio_final_to_reasoning": 1.2320477660602163, "adv/ratio_step_to_reasoning": 1.455128899198913, "adv/std_final_conf": 0.8280212879180908, "adv/std_reasoning": 0.7755022048950195, "adv/std_step_conf": 0.9357369542121887, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6191798941798942, "calib/avg_num_step_conf": 8.4375, "calib/ece": 0.35554655870445345, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.582995951417004, "calib/gap": 0.20311375661375675, "calib/mean_conf": 0.617085020242915, "calib/mu_c": 0.7091851851851853, "calib/mu_w": 0.5060714285714285, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21303643724696356, "calib/std_conf": 0.45976539174194386, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6052729044834307, "calib/step_q_c_n": 1026.0, "calib/step_q_gap": 0.17805950060336018, "calib/step_q_w": 0.42721340388007056, "calib/step_q_w_n": 1134.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3008.0, "completions/max_terminated_length": 3008.0, "completions/mean_length": 684.1484375, "completions/mean_terminated_length": 689.535400390625, "completions/min_length": 0.0, "completions/min_terminated_length": 203.0, "epoch": 0.13333333333333333, "grad_norm": 0.03402207791805267, "learning_rate": 2.0833333333333334e-06, "loss": 0.0411, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.02608790621161461, "mask/share_reasoning": 0.8333162069320679, "mask/share_step_conf": 0.13278338313102722, "num_tokens": 30995773.0, "reward": 1.2464786767959595, "reward_std": 0.30777016282081604, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6141093969345093, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7902053594589233, "step": 125 }, { "adv/mean_abs_final_conf": 0.5593317151069641, "adv/mean_abs_reasoning": 0.42206138372421265, "adv/mean_abs_step_conf": 0.759347677230835, "adv/ratio_final_to_reasoning": 1.325237836666071, "adv/ratio_step_to_reasoning": 1.7991403774741332, "adv/std_final_conf": 0.7963278889656067, "adv/std_reasoning": 0.7015090584754944, "adv/std_step_conf": 0.9353612661361694, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.7362362714728246, "calib/avg_num_step_conf": 9.1640625, "calib/ece": 0.23920833333333336, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.5291666666666667, "calib/gap": 0.4673176569980287, "calib/mean_conf": 0.5505416666666666, "calib/mu_c": 0.7569402985074626, "calib/mu_w": 0.2896226415094339, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1157083333333333, "calib/std_conf": 0.47981867053838395, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5592987512007686, "calib/step_q_c_n": 1041.0, "calib/step_q_gap": 0.24736695043448503, "calib/step_q_w": 0.3119318007662835, "calib/step_q_w_n": 1305.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2628.0, "completions/max_terminated_length": 2628.0, "completions/mean_length": 612.140625, "completions/mean_terminated_length": 637.0243530273438, "completions/min_length": 0.0, "completions/min_terminated_length": 217.0, "epoch": 0.1344, "grad_norm": 0.024109316989779472, "learning_rate": 2.0555555555555555e-06, "loss": -0.1785, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.0260985866189003, "mask/share_reasoning": 0.7924785017967224, "mask/share_step_conf": 0.1423604041337967, "num_tokens": 31257945.0, "reward": 1.302736759185791, "reward_std": 0.285793662071228, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.706494927406311, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.8033955097198486, "step": 126 }, { "adv/mean_abs_final_conf": 0.7272650599479675, "adv/mean_abs_reasoning": 0.491754412651062, "adv/mean_abs_step_conf": 0.7553401589393616, "adv/ratio_final_to_reasoning": 1.4789192353704788, "adv/ratio_step_to_reasoning": 1.5360109426721793, "adv/std_final_conf": 0.8917009234428406, "adv/std_reasoning": 0.7577455043792725, "adv/std_step_conf": 0.9358600974082947, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.7085182534001431, "calib/avg_num_step_conf": 9.0, "calib/ece": 0.2907172995780591, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.45569620253164556, "calib/gap": 0.3459642090193272, "calib/mean_conf": 0.4984810126582278, "calib/mu_c": 0.6590551181102362, "calib/mu_w": 0.31309090909090903, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12666666666666668, "calib/std_conf": 0.46878446445254, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5359112825458052, "calib/step_q_c_n": 1037.0, "calib/step_q_gap": 0.22067055642110123, "calib/step_q_w": 0.315240726124704, "calib/step_q_w_n": 1267.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2900.0, "completions/max_terminated_length": 2900.0, "completions/mean_length": 640.9140625, "completions/mean_terminated_length": 661.5886840820312, "completions/min_length": 0.0, "completions/min_terminated_length": 146.0, "epoch": 0.13546666666666668, "grad_norm": 0.032043445855379105, "learning_rate": 2.027777777777778e-06, "loss": -0.0016, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.026310283690690994, "mask/share_reasoning": 0.8025298118591309, "mask/share_step_conf": 0.13990992307662964, "num_tokens": 31525691.0, "reward": 1.245164394378662, "reward_std": 0.34896039962768555, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6501039266586304, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.7783154845237732, "step": 127 }, { "adv/mean_abs_final_conf": 0.6428596377372742, "adv/mean_abs_reasoning": 0.5563979148864746, "adv/mean_abs_step_conf": 0.7521604299545288, "adv/ratio_final_to_reasoning": 1.1553954832279356, "adv/ratio_step_to_reasoning": 1.3518390522868815, "adv/std_final_conf": 0.8436455130577087, "adv/std_reasoning": 0.7928293347358704, "adv/std_step_conf": 0.9357814192771912, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6912990116917511, "calib/avg_num_step_conf": 6.9765625, "calib/ece": 0.283064, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.492, "calib/gap": 0.3543325366578386, "calib/mean_conf": 0.526776, "calib/mu_c": 0.6869343065693431, "calib/mu_w": 0.3326017699115045, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13092000000000004, "calib/std_conf": 0.47018616507081534, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5741783029001075, "calib/step_q_c_n": 931.0, "calib/step_q_gap": 0.1583537414965987, "calib/step_q_w": 0.4158245614035088, "calib/step_q_w_n": 855.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2654.0, "completions/max_terminated_length": 2654.0, "completions/mean_length": 579.63671875, "completions/mean_terminated_length": 588.8373413085938, "completions/min_length": 0.0, "completions/min_terminated_length": 211.0, "epoch": 0.13653333333333334, "grad_norm": 0.024526173248887062, "learning_rate": 2.0000000000000003e-06, "loss": -0.0954, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.02894548699259758, "mask/share_reasoning": 0.8233139514923096, "mask/share_step_conf": 0.13211554288864136, "num_tokens": 31780742.0, "reward": 1.3008251190185547, "reward_std": 0.3150237202644348, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6897578239440918, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8051649332046509, "step": 128 }, { "adv/mean_abs_final_conf": 0.6891508102416992, "adv/mean_abs_reasoning": 0.4242488145828247, "adv/mean_abs_step_conf": 0.7488288879394531, "adv/ratio_final_to_reasoning": 1.6244024415704255, "adv/ratio_step_to_reasoning": 1.7650700772748105, "adv/std_final_conf": 0.8731085658073425, "adv/std_reasoning": 0.7013865113258362, "adv/std_step_conf": 0.9358651638031006, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6691920885015086, "calib/avg_num_step_conf": 8.26171875, "calib/ece": 0.37857142857142856, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.4365079365079365, "calib/gap": 0.21326315789473688, "calib/mean_conf": 0.4896031746031746, "calib/mu_c": 0.57, "calib/mu_w": 0.3567368421052631, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1225793650793651, "calib/std_conf": 0.4602706256962427, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.50537734375, "calib/step_q_c_n": 1280.0, "calib/step_q_gap": 0.0858204575224551, "calib/step_q_w": 0.4195568862275449, "calib/step_q_w_n": 835.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2562.0, "completions/max_terminated_length": 2562.0, "completions/mean_length": 601.7578125, "completions/mean_terminated_length": 604.11767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 192.0, "epoch": 0.1376, "grad_norm": 0.048718493431806564, "learning_rate": 1.9722222222222224e-06, "loss": 0.0562, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.027454271912574768, "mask/share_reasoning": 0.822243332862854, "mask/share_step_conf": 0.1463961899280548, "num_tokens": 32037176.0, "reward": 1.2862615585327148, "reward_std": 0.27578479051589966, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6257296800613403, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8136311769485474, "step": 129 }, { "adv/mean_abs_final_conf": 0.5660302639007568, "adv/mean_abs_reasoning": 0.32305261492729187, "adv/mean_abs_step_conf": 0.7698035836219788, "adv/ratio_final_to_reasoning": 1.7521302653072501, "adv/ratio_step_to_reasoning": 2.3829046664588533, "adv/std_final_conf": 0.7834324836730957, "adv/std_reasoning": 0.5962008833885193, "adv/std_step_conf": 0.9351350665092468, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7171706105755186, "calib/avg_num_step_conf": 7.0234375, "calib/ece": 0.2849392712550607, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5506072874493927, "calib/gap": 0.37632632193981885, "calib/mean_conf": 0.5885829959514169, "calib/mu_c": 0.7165644171779141, "calib/mu_w": 0.34023809523809523, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10680161943319838, "calib/std_conf": 0.4692858043757504, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6083996212121212, "calib/step_q_c_n": 1056.0, "calib/step_q_gap": 0.25159369129298376, "calib/step_q_w": 0.35680592991913745, "calib/step_q_w_n": 742.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3031.0, "completions/max_terminated_length": 3031.0, "completions/mean_length": 562.48046875, "completions/mean_terminated_length": 569.1502075195312, "completions/min_length": 0.0, "completions/min_terminated_length": 205.0, "epoch": 0.13866666666666666, "grad_norm": 0.026100866496562958, "learning_rate": 1.944444444444445e-06, "loss": -0.0812, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.029097583144903183, "mask/share_reasoning": 0.8250260949134827, "mask/share_step_conf": 0.13415755331516266, "num_tokens": 32286459.0, "reward": 1.3368961811065674, "reward_std": 0.2628602981567383, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.69388747215271, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8297960758209229, "step": 130 }, { "adv/mean_abs_final_conf": 0.6387416124343872, "adv/mean_abs_reasoning": 0.360165536403656, "adv/mean_abs_step_conf": 0.7710878849029541, "adv/ratio_final_to_reasoning": 1.7734667753399833, "adv/ratio_step_to_reasoning": 2.140926343487669, "adv/std_final_conf": 0.8598631620407104, "adv/std_reasoning": 0.6611831188201904, "adv/std_step_conf": 0.9360483884811401, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7330898268398269, "calib/avg_num_step_conf": 7.390625, "calib/ece": 0.25274590163934435, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.4139344262295082, "calib/gap": 0.38307359307359296, "calib/mean_conf": 0.46454918032786885, "calib/mu_c": 0.6717857142857142, "calib/mu_w": 0.28871212121212125, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.12913934426229515, "calib/std_conf": 0.4619455046005314, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5429871794871794, "calib/step_q_c_n": 780.0, "calib/step_q_gap": 0.1899745895591219, "calib/step_q_w": 0.35301258992805756, "calib/step_q_w_n": 1112.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2298.0, "completions/max_terminated_length": 2298.0, "completions/mean_length": 585.35546875, "completions/mean_terminated_length": 601.8112182617188, "completions/min_length": 0.0, "completions/min_terminated_length": 211.0, "epoch": 0.13973333333333332, "grad_norm": 0.043049655854701996, "learning_rate": 1.916666666666667e-06, "loss": -0.1359, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.026013091206550598, "mask/share_reasoning": 0.8193711042404175, "mask/share_step_conf": 0.1272720843553543, "num_tokens": 32542518.0, "reward": 1.2662631273269653, "reward_std": 0.33393311500549316, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.686969518661499, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.7844971418380737, "step": 131 }, { "adv/mean_abs_final_conf": 0.5512483716011047, "adv/mean_abs_reasoning": 0.46704182028770447, "adv/mean_abs_step_conf": 0.7561357021331787, "adv/ratio_final_to_reasoning": 1.1802976685503834, "adv/ratio_step_to_reasoning": 1.6189892838020121, "adv/std_final_conf": 0.7938889265060425, "adv/std_reasoning": 0.7393816709518433, "adv/std_step_conf": 0.9344714879989624, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.8069264069264069, "calib/avg_num_step_conf": 7.61328125, "calib/ece": 0.1663855421686748, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5662650602409639, "calib/gap": 0.5592402597402596, "calib/mean_conf": 0.6087951807228916, "calib/mu_c": 0.7974545454545453, "calib/mu_w": 0.23821428571428574, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.056265060240963956, "calib/std_conf": 0.4545761897261351, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6008884859474161, "calib/step_q_c_n": 1103.0, "calib/step_q_gap": 0.28548659469446097, "calib/step_q_w": 0.3154018912529551, "calib/step_q_w_n": 846.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2979.0, "completions/max_terminated_length": 2979.0, "completions/mean_length": 631.93359375, "completions/mean_terminated_length": 639.4268798828125, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.1408, "grad_norm": 0.03704225271940231, "learning_rate": 1.888888888888889e-06, "loss": -0.0146, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.0284785944968462, "mask/share_reasoning": 0.8273619413375854, "mask/share_step_conf": 0.1324407309293747, "num_tokens": 32809885.0, "reward": 1.4165544509887695, "reward_std": 0.250224232673645, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.794607400894165, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8579226732254028, "step": 132 }, { "adv/mean_abs_final_conf": 0.7426465749740601, "adv/mean_abs_reasoning": 0.522463858127594, "adv/mean_abs_step_conf": 0.7531304359436035, "adv/ratio_final_to_reasoning": 1.421431479749732, "adv/ratio_step_to_reasoning": 1.4414976734327087, "adv/std_final_conf": 0.8922584056854248, "adv/std_reasoning": 0.7756352424621582, "adv/std_step_conf": 0.9363987445831299, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.727751263335205, "calib/avg_num_step_conf": 9.1328125, "calib/ece": 0.28414937759336106, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.4605809128630705, "calib/gap": 0.36234348680516554, "calib/mean_conf": 0.5073858921161826, "calib/mu_c": 0.7133653846153846, "calib/mu_w": 0.35102189781021903, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18000000000000008, "calib/std_conf": 0.4655976087191073, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5147852760736197, "calib/step_q_c_n": 815.0, "calib/step_q_gap": 0.1762888873671194, "calib/step_q_w": 0.3384963887065003, "calib/step_q_w_n": 1523.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2813.0, "completions/max_terminated_length": 2813.0, "completions/mean_length": 719.05078125, "completions/mean_terminated_length": 736.3080444335938, "completions/min_length": 0.0, "completions/min_terminated_length": 190.0, "epoch": 0.14186666666666667, "grad_norm": 0.03149598091840744, "learning_rate": 1.8611111111111113e-06, "loss": -0.0652, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.0236146692186594, "mask/share_reasoning": 0.825226366519928, "mask/share_step_conf": 0.12772145867347717, "num_tokens": 33100306.0, "reward": 1.2592096328735352, "reward_std": 0.399488627910614, "rewards/accuracy_reward_step": 0.41015625, "rewards/final_brier_reward_step": 0.6683304309844971, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.7898881435394287, "step": 133 }, { "adv/mean_abs_final_conf": 0.7197145223617554, "adv/mean_abs_reasoning": 0.5542685985565186, "adv/mean_abs_step_conf": 0.7648236751556396, "adv/ratio_final_to_reasoning": 1.2984941312499167, "adv/ratio_step_to_reasoning": 1.3798791364826901, "adv/std_final_conf": 0.8917521834373474, "adv/std_reasoning": 0.8099950551986694, "adv/std_step_conf": 0.9360350370407104, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.7137681159420289, "calib/avg_num_step_conf": 7.9375, "calib/ece": 0.25962083333333336, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.6208333333333333, "calib/gap": 0.3615971867007671, "calib/mean_conf": 0.6703791666666667, "calib/mu_c": 0.8240579710144926, "calib/mu_w": 0.4624607843137255, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.17750000000000005, "calib/std_conf": 0.43895969108712657, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5716602687140115, "calib/step_q_c_n": 1042.0, "calib/step_q_gap": 0.2143956222493651, "calib/step_q_w": 0.35726464646464645, "calib/step_q_w_n": 990.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3024.0, "completions/max_terminated_length": 3024.0, "completions/mean_length": 712.8828125, "completions/mean_terminated_length": 724.1984252929688, "completions/min_length": 0.0, "completions/min_terminated_length": 229.0, "epoch": 0.14293333333333333, "grad_norm": 0.022994376718997955, "learning_rate": 1.8333333333333333e-06, "loss": -0.1002, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.02312406152486801, "mask/share_reasoning": 0.8467497229576111, "mask/share_step_conf": 0.1145012304186821, "num_tokens": 33391756.0, "reward": 1.2713760137557983, "reward_std": 0.4110109806060791, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6849120259284973, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.7812637090682983, "step": 134 }, { "adv/mean_abs_final_conf": 0.565937340259552, "adv/mean_abs_reasoning": 0.5127121210098267, "adv/mean_abs_step_conf": 0.7459043264389038, "adv/ratio_final_to_reasoning": 1.1038111194736222, "adv/ratio_step_to_reasoning": 1.4548209333724875, "adv/std_final_conf": 0.7928846478462219, "adv/std_reasoning": 0.7754972577095032, "adv/std_step_conf": 0.9352908730506897, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6286195286195286, "calib/avg_num_step_conf": 7.7421875, "calib/ece": 0.3159385542168675, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.6746987951807228, "calib/gap": 0.23028989898989904, "calib/mean_conf": 0.7117722891566265, "calib/mu_c": 0.8033333333333333, "calib/mu_w": 0.5730434343434343, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2126506024096386, "calib/std_conf": 0.42404818803797983, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.527002700270027, "calib/step_q_c_n": 1111.0, "calib/step_q_gap": 0.08112439946635308, "calib/step_q_w": 0.44587830080367397, "calib/step_q_w_n": 871.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2268.0, "completions/max_terminated_length": 2268.0, "completions/mean_length": 622.4453125, "completions/mean_terminated_length": 629.8261108398438, "completions/min_length": 0.0, "completions/min_terminated_length": 186.0, "epoch": 0.144, "grad_norm": 0.023047512397170067, "learning_rate": 1.8055555555555557e-06, "loss": -0.0883, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.027006974443793297, "mask/share_reasoning": 0.8312538266181946, "mask/share_step_conf": 0.13002046942710876, "num_tokens": 33656982.0, "reward": 1.2960524559020996, "reward_std": 0.3031335771083832, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6604580879211426, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8099638819694519, "step": 135 }, { "adv/mean_abs_final_conf": 0.6360206007957458, "adv/mean_abs_reasoning": 0.503818929195404, "adv/mean_abs_step_conf": 0.7464261054992676, "adv/ratio_final_to_reasoning": 1.2623991754566806, "adv/ratio_step_to_reasoning": 1.4815364454275384, "adv/std_final_conf": 0.8439712524414062, "adv/std_reasoning": 0.7929543852806091, "adv/std_step_conf": 0.9357492327690125, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7431534985622347, "calib/avg_num_step_conf": 8.578125, "calib/ece": 0.2198353909465022, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6090534979423868, "calib/gap": 0.41825961933452, "calib/mean_conf": 0.6647736625514404, "calib/mu_c": 0.8523880597014926, "calib/mu_w": 0.43412844036697257, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.16658436213991784, "calib/std_conf": 0.4341268259791531, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5619423306772907, "calib/step_q_c_n": 1004.0, "calib/step_q_gap": 0.2213466930933981, "calib/step_q_w": 0.3405956375838926, "calib/step_q_w_n": 1192.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2777.0, "completions/max_terminated_length": 2777.0, "completions/mean_length": 609.71875, "completions/mean_terminated_length": 624.35205078125, "completions/min_length": 0.0, "completions/min_terminated_length": 170.0, "epoch": 0.14506666666666668, "grad_norm": 0.02454623207449913, "learning_rate": 1.777777777777778e-06, "loss": -0.1694, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.027274496853351593, "mask/share_reasoning": 0.8087484836578369, "mask/share_step_conf": 0.14053946733474731, "num_tokens": 33921558.0, "reward": 1.3231323957443237, "reward_std": 0.3496283292770386, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7197468876838684, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8159933686256409, "step": 136 }, { "adv/mean_abs_final_conf": 0.5607572793960571, "adv/mean_abs_reasoning": 0.37462568283081055, "adv/mean_abs_step_conf": 0.7560116052627563, "adv/ratio_final_to_reasoning": 1.4968468663407357, "adv/ratio_step_to_reasoning": 2.0180453180626925, "adv/std_final_conf": 0.7927778959274292, "adv/std_reasoning": 0.6614594459533691, "adv/std_step_conf": 0.935052752494812, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6742097701149427, "calib/avg_num_step_conf": 8.97265625, "calib/ece": 0.29374999999999996, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.6693548387096774, "calib/gap": 0.2998563218390804, "calib/mean_conf": 0.7189112903225806, "calib/mu_c": 0.8591666666666666, "calib/mu_w": 0.5593103448275862, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24020161290322575, "calib/std_conf": 0.4140116780357143, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5718988549618321, "calib/step_q_c_n": 1048.0, "calib/step_q_gap": 0.17152255392099947, "calib/step_q_w": 0.40037630104083266, "calib/step_q_w_n": 1249.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2734.0, "completions/max_terminated_length": 2734.0, "completions/mean_length": 649.9375, "completions/mean_terminated_length": 660.2540283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 240.0, "epoch": 0.14613333333333334, "grad_norm": 0.027212072163820267, "learning_rate": 1.75e-06, "loss": -0.0234, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.024864552542567253, "mask/share_reasoning": 0.8198833465576172, "mask/share_step_conf": 0.139627143740654, "num_tokens": 34194926.0, "reward": 1.2909659147262573, "reward_std": 0.2898910343647003, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6685027480125427, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8090583086013794, "step": 137 }, { "adv/mean_abs_final_conf": 0.5885981321334839, "adv/mean_abs_reasoning": 0.5579100251197815, "adv/mean_abs_step_conf": 0.7548394203186035, "adv/ratio_final_to_reasoning": 1.0550054769263444, "adv/ratio_step_to_reasoning": 1.3529769789609747, "adv/std_final_conf": 0.7946194410324097, "adv/std_reasoning": 0.7931153178215027, "adv/std_step_conf": 0.935364305973053, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.7048546691403834, "calib/avg_num_step_conf": 8.69921875, "calib/ece": 0.22147058823529409, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.7100840336134454, "calib/gap": 0.3526623376623378, "calib/mean_conf": 0.7524789915966387, "calib/mu_c": 0.876948051948052, "calib/mu_w": 0.5242857142857142, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.16344537815126048, "calib/std_conf": 0.40030890065698216, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5912727272727274, "calib/step_q_c_n": 1100.0, "calib/step_q_gap": 0.2486462853916271, "calib/step_q_w": 0.3426264418811003, "calib/step_q_w_n": 1127.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2808.0, "completions/max_terminated_length": 2808.0, "completions/mean_length": 601.58203125, "completions/mean_terminated_length": 626.0365600585938, "completions/min_length": 0.0, "completions/min_terminated_length": 186.0, "epoch": 0.1472, "grad_norm": 0.024571401998400688, "learning_rate": 1.7222222222222224e-06, "loss": -0.1332, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.02681715041399002, "mask/share_reasoning": 0.8012897968292236, "mask/share_step_conf": 0.13283054530620575, "num_tokens": 34453267.0, "reward": 1.292980432510376, "reward_std": 0.37091195583343506, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7039051055908203, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.7886841297149658, "step": 138 }, { "adv/mean_abs_final_conf": 0.5235190391540527, "adv/mean_abs_reasoning": 0.3951146602630615, "adv/mean_abs_step_conf": 0.7521218657493591, "adv/ratio_final_to_reasoning": 1.3249800420098345, "adv/ratio_step_to_reasoning": 1.9035534273737336, "adv/std_final_conf": 0.7528417110443115, "adv/std_reasoning": 0.6815641522407532, "adv/std_step_conf": 0.9350164532661438, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7040816326530612, "calib/avg_num_step_conf": 7.5, "calib/ece": 0.2350396825396825, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7857142857142857, "calib/gap": 0.2928571428571429, "calib/mean_conf": 0.8327380952380952, "calib/mu_c": 0.9303571428571429, "calib/mu_w": 0.6375, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20055555555555551, "calib/std_conf": 0.33762952079262504, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.632515404699739, "calib/step_q_c_n": 1149.0, "calib/step_q_gap": 0.15545963297470655, "calib/step_q_w": 0.47705577172503244, "calib/step_q_w_n": 771.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2996.0, "completions/max_terminated_length": 2996.0, "completions/mean_length": 580.10546875, "completions/mean_terminated_length": 584.6732177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.14826666666666666, "grad_norm": 0.023692641407251358, "learning_rate": 1.6944444444444446e-06, "loss": 0.0227, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.02832508645951748, "mask/share_reasoning": 0.8266885280609131, "mask/share_step_conf": 0.13717389106750488, "num_tokens": 34704870.0, "reward": 1.3846945762634277, "reward_std": 0.28370344638824463, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.751888632774353, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8450783491134644, "step": 139 }, { "adv/mean_abs_final_conf": 0.5185952186584473, "adv/mean_abs_reasoning": 0.4070984721183777, "adv/mean_abs_step_conf": 0.7555028200149536, "adv/ratio_final_to_reasoning": 1.2738815155947039, "adv/ratio_step_to_reasoning": 1.8558232755913304, "adv/std_final_conf": 0.7767248153686523, "adv/std_reasoning": 0.6816426515579224, "adv/std_step_conf": 0.934374213218689, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7118879178804273, "calib/avg_num_step_conf": 7.6171875, "calib/ece": 0.22605458167330678, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8087649402390438, "calib/gap": 0.3221040851713136, "calib/mean_conf": 0.8498019920318725, "calib/mu_c": 0.9640141975308641, "calib/mu_w": 0.6419101123595505, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2152191235059761, "calib/std_conf": 0.324334355103517, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6633154121863799, "calib/step_q_c_n": 1116.0, "calib/step_q_gap": 0.21230222273793858, "calib/step_q_w": 0.4510131894484413, "calib/step_q_w_n": 834.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2684.0, "completions/max_terminated_length": 2684.0, "completions/mean_length": 598.13671875, "completions/mean_terminated_length": 605.229248046875, "completions/min_length": 0.0, "completions/min_terminated_length": 206.0, "epoch": 0.14933333333333335, "grad_norm": 0.030547764152288437, "learning_rate": 1.6666666666666667e-06, "loss": -0.0224, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.027081944048404694, "mask/share_reasoning": 0.8269479274749756, "mask/share_step_conf": 0.13425138592720032, "num_tokens": 34963009.0, "reward": 1.3870867490768433, "reward_std": 0.26042604446411133, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7565398216247559, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8474887013435364, "step": 140 }, { "adv/mean_abs_final_conf": 0.47909656167030334, "adv/mean_abs_reasoning": 0.4322686195373535, "adv/mean_abs_step_conf": 0.7472922801971436, "adv/ratio_final_to_reasoning": 1.1083306537103448, "adv/ratio_step_to_reasoning": 1.7287682853244173, "adv/std_final_conf": 0.7406020760536194, "adv/std_reasoning": 0.7205878496170044, "adv/std_step_conf": 0.9347190260887146, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7789922480620155, "calib/avg_num_step_conf": 7.76171875, "calib/ece": 0.1749797570850204, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.7732793522267206, "calib/gap": 0.45480465116279084, "calib/mean_conf": 0.8055060728744938, "calib/mu_c": 0.9436046511627908, "calib/mu_w": 0.48879999999999996, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.14206477732793538, "calib/std_conf": 0.3698172857802966, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.614653937947494, "calib/step_q_c_n": 1257.0, "calib/step_q_gap": 0.22399640370091856, "calib/step_q_w": 0.39065753424657546, "calib/step_q_w_n": 730.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2514.0, "completions/max_terminated_length": 2514.0, "completions/mean_length": 641.890625, "completions/mean_terminated_length": 646.9448852539062, "completions/min_length": 0.0, "completions/min_terminated_length": 62.0, "epoch": 0.1504, "grad_norm": 0.02207571640610695, "learning_rate": 1.638888888888889e-06, "loss": -0.1009, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.025763999670743942, "mask/share_reasoning": 0.8380931615829468, "mask/share_step_conf": 0.12833033502101898, "num_tokens": 35234429.0, "reward": 1.4104137420654297, "reward_std": 0.2923741936683655, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.8029515743255615, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8452661633491516, "step": 141 }, { "adv/mean_abs_final_conf": 0.5865595936775208, "adv/mean_abs_reasoning": 0.3907625079154968, "adv/mean_abs_step_conf": 0.746477484703064, "adv/ratio_final_to_reasoning": 1.5010641548148869, "adv/ratio_step_to_reasoning": 1.9103098930475981, "adv/std_final_conf": 0.8279939889907837, "adv/std_reasoning": 0.701130747795105, "adv/std_step_conf": 0.935562014579773, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7004310344827586, "calib/avg_num_step_conf": 7.96875, "calib/ece": 0.29021811023622057, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7362204724409449, "calib/gap": 0.32836880309845107, "calib/mean_conf": 0.7786007874015748, "calib/mu_c": 0.9285644927536233, "calib/mu_w": 0.6001956896551722, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2627559055118111, "calib/std_conf": 0.3905670701666905, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6373262906309752, "calib/step_q_c_n": 1046.0, "calib/step_q_gap": 0.1907065723211161, "calib/step_q_w": 0.44661971830985914, "calib/step_q_w_n": 994.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2626.0, "completions/max_terminated_length": 2626.0, "completions/mean_length": 624.6328125, "completions/mean_terminated_length": 627.0823974609375, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.15146666666666667, "grad_norm": 0.02521516941487789, "learning_rate": 1.6111111111111113e-06, "loss": 0.0182, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02745303511619568, "mask/share_reasoning": 0.8278377056121826, "mask/share_step_conf": 0.14080297946929932, "num_tokens": 35499495.0, "reward": 1.3193681240081787, "reward_std": 0.2825583815574646, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.701399564743042, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8155432939529419, "step": 142 }, { "adv/mean_abs_final_conf": 0.4948643743991852, "adv/mean_abs_reasoning": 0.44098764657974243, "adv/mean_abs_step_conf": 0.7755706310272217, "adv/ratio_final_to_reasoning": 1.1221728731797942, "adv/ratio_step_to_reasoning": 1.7587128279952342, "adv/std_final_conf": 0.7587860226631165, "adv/std_reasoning": 0.7205314040184021, "adv/std_step_conf": 0.9349554181098938, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.7777046783625731, "calib/avg_num_step_conf": 8.8984375, "calib/ece": 0.2130962343096235, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.7196652719665272, "calib/gap": 0.4375840643274854, "calib/mean_conf": 0.7689121338912134, "calib/mu_c": 0.9428472222222223, "calib/mu_w": 0.5052631578947369, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18974895397489547, "calib/std_conf": 0.38111550185258836, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6166698564593303, "calib/step_q_c_n": 1045.0, "calib/step_q_gap": 0.2781864825744965, "calib/step_q_w": 0.33848337388483374, "calib/step_q_w_n": 1233.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2999.0, "completions/max_terminated_length": 2999.0, "completions/mean_length": 657.38671875, "completions/mean_terminated_length": 678.5927124023438, "completions/min_length": 0.0, "completions/min_terminated_length": 210.0, "epoch": 0.15253333333333333, "grad_norm": 0.02659987099468708, "learning_rate": 1.5833333333333333e-06, "loss": -0.0362, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.02512693963944912, "mask/share_reasoning": 0.8076825737953186, "mask/share_step_conf": 0.13594046235084534, "num_tokens": 35775122.0, "reward": 1.3324463367462158, "reward_std": 0.2990143597126007, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7442284822463989, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.8107225298881531, "step": 143 }, { "adv/mean_abs_final_conf": 0.4784291386604309, "adv/mean_abs_reasoning": 0.3893376588821411, "adv/mean_abs_step_conf": 0.7565593719482422, "adv/ratio_final_to_reasoning": 1.228828313279757, "adv/ratio_step_to_reasoning": 1.9431959757513853, "adv/std_final_conf": 0.758713960647583, "adv/std_reasoning": 0.6614014506340027, "adv/std_step_conf": 0.9349938035011292, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7353927203065134, "calib/avg_num_step_conf": 7.359375, "calib/ece": 0.18304878048780482, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.7439024390243902, "calib/gap": 0.3871599616858238, "calib/mean_conf": 0.7889837398373983, "calib/mu_c": 0.9022988505747126, "calib/mu_w": 0.5151388888888888, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.13235772357723571, "calib/std_conf": 0.3750308710055105, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5998940505297473, "calib/step_q_c_n": 1227.0, "calib/step_q_gap": 0.2120706106515129, "calib/step_q_w": 0.3878234398782344, "calib/step_q_w_n": 657.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2403.0, "completions/max_terminated_length": 2403.0, "completions/mean_length": 601.13671875, "completions/mean_terminated_length": 613.111572265625, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.1536, "grad_norm": 0.0394146703183651, "learning_rate": 1.5555555555555558e-06, "loss": -0.1006, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.028188010677695274, "mask/share_reasoning": 0.8198911547660828, "mask/share_step_conf": 0.1323896050453186, "num_tokens": 36033141.0, "reward": 1.3731136322021484, "reward_std": 0.2826434373855591, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.7666667699813843, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8268896341323853, "step": 144 }, { "adv/mean_abs_final_conf": 0.6184766292572021, "adv/mean_abs_reasoning": 0.5618990659713745, "adv/mean_abs_step_conf": 0.7520120143890381, "adv/ratio_final_to_reasoning": 1.1006899044902665, "adv/ratio_step_to_reasoning": 1.3383400328117803, "adv/std_final_conf": 0.8280069828033447, "adv/std_reasoning": 0.7929149866104126, "adv/std_step_conf": 0.9355904459953308, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6229311846689896, "calib/avg_num_step_conf": 8.15234375, "calib/ece": 0.2644354838709678, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.8104838709677419, "calib/gap": 0.20515098722415792, "calib/mean_conf": 0.8543548387096774, "calib/mu_c": 0.9238414634146341, "calib/mu_w": 0.7186904761904762, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22875000000000006, "calib/std_conf": 0.3157919944629842, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6530969845150774, "calib/step_q_c_n": 1227.0, "calib/step_q_gap": 0.19349233335228666, "calib/step_q_w": 0.4596046511627907, "calib/step_q_w_n": 860.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2685.0, "completions/max_terminated_length": 2685.0, "completions/mean_length": 572.71875, "completions/mean_terminated_length": 584.1275024414062, "completions/min_length": 0.0, "completions/min_terminated_length": 207.0, "epoch": 0.15466666666666667, "grad_norm": 0.02873079851269722, "learning_rate": 1.527777777777778e-06, "loss": -0.1331, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.028295297175645828, "mask/share_reasoning": 0.8084123134613037, "mask/share_step_conf": 0.14376112818717957, "num_tokens": 36282461.0, "reward": 1.3241336345672607, "reward_std": 0.3417121171951294, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.708076536655426, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8091577291488647, "step": 145 }, { "adv/mean_abs_final_conf": 0.6596519947052002, "adv/mean_abs_reasoning": 0.5093032121658325, "adv/mean_abs_step_conf": 0.7680574059486389, "adv/ratio_final_to_reasoning": 1.2952048582218898, "adv/ratio_step_to_reasoning": 1.508055295159918, "adv/std_final_conf": 0.84257972240448, "adv/std_reasoning": 0.7754387855529785, "adv/std_step_conf": 0.9353065490722656, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7108323133414931, "calib/avg_num_step_conf": 7.49609375, "calib/ece": 0.30551440329218105, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.7160493827160493, "calib/gap": 0.3522419420644636, "calib/mean_conf": 0.7636213991769547, "calib/mu_c": 0.9506140350877194, "calib/mu_w": 0.5983720930232558, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.3, "calib/std_conf": 0.38390034912799154, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.6353417721518988, "calib/step_q_c_n": 790.0, "calib/step_q_gap": 0.2182204258277181, "calib/step_q_w": 0.4171213463241807, "calib/step_q_w_n": 1129.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2622.0, "completions/max_terminated_length": 2622.0, "completions/mean_length": 607.8046875, "completions/mean_terminated_length": 632.5121459960938, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.15573333333333333, "grad_norm": 0.02385423704981804, "learning_rate": 1.5e-06, "loss": -0.1869, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.026042595505714417, "mask/share_reasoning": 0.8098737001419067, "mask/share_step_conf": 0.12502123415470123, "num_tokens": 36545275.0, "reward": 1.2390952110290527, "reward_std": 0.3953307271003723, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.6571453213691711, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.7710694074630737, "step": 146 }, { "adv/mean_abs_final_conf": 0.49504554271698, "adv/mean_abs_reasoning": 0.36164918541908264, "adv/mean_abs_step_conf": 0.7685758471488953, "adv/ratio_final_to_reasoning": 1.3688556830103635, "adv/ratio_step_to_reasoning": 2.1251972301783617, "adv/std_final_conf": 0.7386893630027771, "adv/std_reasoning": 0.6612387895584106, "adv/std_step_conf": 0.9356215596199036, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6546900161030595, "calib/avg_num_step_conf": 7.9609375, "calib/ece": 0.36841463414634146, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7195121951219512, "calib/gap": 0.27287439613526576, "calib/mean_conf": 0.7658130081300812, "calib/mu_c": 0.9188888888888889, "calib/mu_w": 0.6460144927536231, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.34760162601626016, "calib/std_conf": 0.38750288432377356, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6030394431554523, "calib/step_q_c_n": 862.0, "calib/step_q_gap": 0.14825202818946592, "calib/step_q_w": 0.4547874149659864, "calib/step_q_w_n": 1176.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2981.0, "completions/max_terminated_length": 2981.0, "completions/mean_length": 656.359375, "completions/mean_terminated_length": 661.5275268554688, "completions/min_length": 0.0, "completions/min_terminated_length": 259.0, "epoch": 0.1568, "grad_norm": 0.025043372064828873, "learning_rate": 1.4722222222222225e-06, "loss": -0.0195, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.024738822132349014, "mask/share_reasoning": 0.836007833480835, "mask/share_step_conf": 0.13144081830978394, "num_tokens": 36816983.0, "reward": 1.1915602684020996, "reward_std": 0.2968427538871765, "rewards/accuracy_reward_step": 0.421875, "rewards/final_brier_reward_step": 0.606521487236023, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7500181198120117, "step": 147 }, { "adv/mean_abs_final_conf": 0.5144132971763611, "adv/mean_abs_reasoning": 0.45546209812164307, "adv/mean_abs_step_conf": 0.7453041076660156, "adv/ratio_final_to_reasoning": 1.129431624053542, "adv/ratio_step_to_reasoning": 1.6363691089548413, "adv/std_final_conf": 0.7764376997947693, "adv/std_reasoning": 0.7394083142280579, "adv/std_step_conf": 0.9351122379302979, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.68747591522158, "calib/avg_num_step_conf": 7.41015625, "calib/ece": 0.20294354838709677, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.7943548387096774, "calib/gap": 0.330892485549133, "calib/mean_conf": 0.8280241935483871, "calib/mu_c": 0.9280924855491329, "calib/mu_w": 0.5972, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1666935483870968, "calib/std_conf": 0.34947040494268067, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6147808764940239, "calib/step_q_c_n": 1255.0, "calib/step_q_gap": 0.18976530017003634, "calib/step_q_w": 0.42501557632398757, "calib/step_q_w_n": 642.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2486.0, "completions/max_terminated_length": 2486.0, "completions/mean_length": 575.42578125, "completions/mean_terminated_length": 589.2360229492188, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.15786666666666666, "grad_norm": 0.024046506732702255, "learning_rate": 1.4444444444444445e-06, "loss": -0.1088, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.029452763497829437, "mask/share_reasoning": 0.8054896593093872, "mask/share_step_conf": 0.14162006974220276, "num_tokens": 37069404.0, "reward": 1.372555136680603, "reward_std": 0.3066759705543518, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7648323774337769, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8256858587265015, "step": 148 }, { "adv/mean_abs_final_conf": 0.5331004858016968, "adv/mean_abs_reasoning": 0.5124787092208862, "adv/mean_abs_step_conf": 0.7494461536407471, "adv/ratio_final_to_reasoning": 1.0402392845005435, "adv/ratio_step_to_reasoning": 1.46239471056294, "adv/std_final_conf": 0.7765871286392212, "adv/std_reasoning": 0.7575570940971375, "adv/std_step_conf": 0.9348960518836975, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7253521126760563, "calib/avg_num_step_conf": 7.828125, "calib/ece": 0.25684426229508195, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.8073770491803278, "calib/gap": 0.3722328086164043, "calib/mean_conf": 0.8304508196721311, "calib/mu_c": 0.986056338028169, "calib/mu_w": 0.6138235294117648, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.2526639344262295, "calib/std_conf": 0.3521343028638739, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.6404268292682928, "calib/step_q_c_n": 984.0, "calib/step_q_gap": 0.25766212338593986, "calib/step_q_w": 0.3827647058823529, "calib/step_q_w_n": 1020.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2420.0, "completions/max_terminated_length": 2420.0, "completions/mean_length": 620.30078125, "completions/mean_terminated_length": 635.1880493164062, "completions/min_length": 0.0, "completions/min_terminated_length": 197.0, "epoch": 0.15893333333333334, "grad_norm": 0.028948621824383736, "learning_rate": 1.4166666666666667e-06, "loss": -0.0542, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.026255900040268898, "mask/share_reasoning": 0.8194175958633423, "mask/share_step_conf": 0.13088896870613098, "num_tokens": 37332657.0, "reward": 1.3208997249603271, "reward_std": 0.3231900632381439, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7129691243171692, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8140244483947754, "step": 149 }, { "adv/mean_abs_final_conf": 0.5323728919029236, "adv/mean_abs_reasoning": 0.4642510414123535, "adv/mean_abs_step_conf": 0.7564442157745361, "adv/ratio_final_to_reasoning": 1.1467349438427288, "adv/ratio_step_to_reasoning": 1.6293861473595554, "adv/std_final_conf": 0.7764654755592346, "adv/std_reasoning": 0.7205824255943298, "adv/std_step_conf": 0.9355114698410034, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6243245216883306, "calib/avg_num_step_conf": 7.39453125, "calib/ece": 0.26751004016064256, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.8313253012048193, "calib/gap": 0.17992697531765733, "calib/mean_conf": 0.8614056224899599, "calib/mu_c": 0.9206586826347305, "calib/mu_w": 0.7407317073170732, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2291164658634538, "calib/std_conf": 0.3127042820309887, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5925986277873071, "calib/step_q_c_n": 1166.0, "calib/step_q_gap": 0.1243042674021626, "calib/step_q_w": 0.4682943603851445, "calib/step_q_w_n": 727.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2356.0, "completions/max_terminated_length": 2356.0, "completions/mean_length": 556.68359375, "completions/mean_terminated_length": 565.5198974609375, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.16, "grad_norm": 0.031456056982278824, "learning_rate": 1.3888888888888892e-06, "loss": -0.017, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.030916960909962654, "mask/share_reasoning": 0.8093185424804688, "mask/share_step_conf": 0.14413949847221375, "num_tokens": 37580128.0, "reward": 1.3363224267959595, "reward_std": 0.2611082196235657, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7043378949165344, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8220440745353699, "step": 150 }, { "adv/mean_abs_final_conf": 0.5166769027709961, "adv/mean_abs_reasoning": 0.4023810029029846, "adv/mean_abs_step_conf": 0.7550654411315918, "adv/ratio_final_to_reasoning": 1.2840489462559657, "adv/ratio_step_to_reasoning": 1.8764937601033829, "adv/std_final_conf": 0.7582067847251892, "adv/std_reasoning": 0.6817334294319153, "adv/std_step_conf": 0.9352085590362549, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6951489533011272, "calib/avg_num_step_conf": 8.140625, "calib/ece": 0.2702845528455285, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.7276422764227642, "calib/gap": 0.3219202898550724, "calib/mean_conf": 0.778089430894309, "calib/mu_c": 0.9194202898550724, "calib/mu_w": 0.5975, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.24369918699186996, "calib/std_conf": 0.37643277524484753, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6179471544715448, "calib/step_q_c_n": 984.0, "calib/step_q_gap": 0.24471988174427206, "calib/step_q_w": 0.37322727272727274, "calib/step_q_w_n": 1100.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2930.0, "completions/max_terminated_length": 2930.0, "completions/mean_length": 636.46484375, "completions/mean_terminated_length": 649.1434326171875, "completions/min_length": 0.0, "completions/min_terminated_length": 216.0, "epoch": 0.16106666666666666, "grad_norm": 0.027021048590540886, "learning_rate": 1.3611111111111112e-06, "loss": -0.0869, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.02588428556919098, "mask/share_reasoning": 0.8255480527877808, "mask/share_step_conf": 0.12903639674186707, "num_tokens": 37850087.0, "reward": 1.2872600555419922, "reward_std": 0.30778902769088745, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6913784742355347, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7919613122940063, "step": 151 }, { "adv/mean_abs_final_conf": 0.6841462850570679, "adv/mean_abs_reasoning": 0.6128799915313721, "adv/mean_abs_step_conf": 0.7404754161834717, "adv/ratio_final_to_reasoning": 1.1162809922177852, "adv/ratio_step_to_reasoning": 1.2081899008210129, "adv/std_final_conf": 0.8762879371643066, "adv/std_reasoning": 0.8267136812210083, "adv/std_step_conf": 0.9357709884643555, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6138136288998359, "calib/avg_num_step_conf": 8.19921875, "calib/ece": 0.38019876033057853, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.756198347107438, "calib/gap": 0.18923923097974815, "calib/mean_conf": 0.7949252066115703, "calib/mu_c": 0.8856349206349207, "calib/mu_w": 0.6963956896551725, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.9609375, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.32723140495867775, "calib/std_conf": 0.370573986635919, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.5821627408993576, "calib/step_q_c_n": 934.0, "calib/step_q_gap": 0.15284085248734047, "calib/step_q_w": 0.4293218884120171, "calib/step_q_w_n": 1165.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2792.0, "completions/max_terminated_length": 2792.0, "completions/mean_length": 629.27734375, "completions/mean_terminated_length": 636.7391357421875, "completions/min_length": 0.0, "completions/min_terminated_length": 109.0, "epoch": 0.16213333333333332, "grad_norm": 0.027300406247377396, "learning_rate": 1.3333333333333334e-06, "loss": -0.1419, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.026354603469371796, "mask/share_reasoning": 0.8259069919586182, "mask/share_step_conf": 0.13601967692375183, "num_tokens": 38116574.0, "reward": 1.1926360130310059, "reward_std": 0.40411001443862915, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.5822392702102661, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.7593287229537964, "step": 152 }, { "adv/mean_abs_final_conf": 0.49810728430747986, "adv/mean_abs_reasoning": 0.4407353401184082, "adv/mean_abs_step_conf": 0.7415911555290222, "adv/ratio_final_to_reasoning": 1.1301732331554308, "adv/ratio_step_to_reasoning": 1.6826223994876062, "adv/std_final_conf": 0.762622594833374, "adv/std_reasoning": 0.7393104434013367, "adv/std_step_conf": 0.9351751804351807, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.582449160035367, "calib/avg_num_step_conf": 7.80859375, "calib/ece": 0.32271604938271603, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.7860082304526749, "calib/gap": 0.11059681697612733, "calib/mean_conf": 0.8395061728395061, "calib/mu_c": 0.8791025641025642, "calib/mu_w": 0.7685057471264368, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.26012345679012344, "calib/std_conf": 0.3278006315996392, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5794348659003832, "calib/step_q_c_n": 1044.0, "calib/step_q_gap": 0.18048198631923135, "calib/step_q_w": 0.39895287958115183, "calib/step_q_w_n": 955.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2868.0, "completions/max_terminated_length": 2868.0, "completions/mean_length": 606.0078125, "completions/mean_terminated_length": 623.0441284179688, "completions/min_length": 0.0, "completions/min_terminated_length": 206.0, "epoch": 0.1632, "grad_norm": 0.035739362239837646, "learning_rate": 1.3055555555555556e-06, "loss": -0.1231, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.026105670258402824, "mask/share_reasoning": 0.8216968774795532, "mask/share_step_conf": 0.12485368549823761, "num_tokens": 38379032.0, "reward": 1.2565110921859741, "reward_std": 0.2758803963661194, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6402719020843506, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.780515730381012, "step": 153 }, { "adv/mean_abs_final_conf": 0.4910415709018707, "adv/mean_abs_reasoning": 0.41510850191116333, "adv/mean_abs_step_conf": 0.7597833871841431, "adv/ratio_final_to_reasoning": 1.1829234251794671, "adv/ratio_step_to_reasoning": 1.8303248034817245, "adv/std_final_conf": 0.7406032085418701, "adv/std_reasoning": 0.7013994455337524, "adv/std_step_conf": 0.9353368878364563, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.609903381642512, "calib/avg_num_step_conf": 7.10546875, "calib/ece": 0.36212851405622504, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8032128514056225, "calib/gap": 0.17549353701527615, "calib/mean_conf": 0.8347389558232933, "calib/mu_c": 0.9129710144927536, "calib/mu_w": 0.7374774774774775, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3213253012048194, "calib/std_conf": 0.3449696339126116, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5856728778467909, "calib/step_q_c_n": 966.0, "calib/step_q_gap": 0.1229765120789128, "calib/step_q_w": 0.4626963657678781, "calib/step_q_w_n": 853.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2499.0, "completions/max_terminated_length": 2499.0, "completions/mean_length": 578.46484375, "completions/mean_terminated_length": 583.0196533203125, "completions/min_length": 0.0, "completions/min_terminated_length": 181.0, "epoch": 0.16426666666666667, "grad_norm": 0.026899857446551323, "learning_rate": 1.2777777777777779e-06, "loss": -0.0082, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.028808027505874634, "mask/share_reasoning": 0.829174280166626, "mask/share_step_conf": 0.134205162525177, "num_tokens": 38631559.0, "reward": 1.2400684356689453, "reward_std": 0.2998583912849426, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6244043111801147, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7766944169998169, "step": 154 }, { "adv/mean_abs_final_conf": 0.5528605580329895, "adv/mean_abs_reasoning": 0.5002375841140747, "adv/mean_abs_step_conf": 0.7550753355026245, "adv/ratio_final_to_reasoning": 1.1051959620589296, "adv/ratio_step_to_reasoning": 1.5094334361938633, "adv/std_final_conf": 0.7846831679344177, "adv/std_reasoning": 0.775325357913971, "adv/std_step_conf": 0.9355731010437012, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6421815889029003, "calib/avg_num_step_conf": 7.59765625, "calib/ece": 0.31912698412698415, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6468253968253969, "calib/gap": 0.23936696090794463, "calib/mean_conf": 0.6987301587301586, "calib/mu_c": 0.8222131147540984, "calib/mu_w": 0.5828461538461538, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.26686507936507936, "calib/std_conf": 0.41400575896420855, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.539650110375276, "calib/step_q_c_n": 906.0, "calib/step_q_gap": 0.11493403722801904, "calib/step_q_w": 0.42471607314725696, "calib/step_q_w_n": 1039.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2685.0, "completions/max_terminated_length": 2685.0, "completions/mean_length": 585.1953125, "completions/mean_terminated_length": 587.490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 205.0, "epoch": 0.16533333333333333, "grad_norm": 0.027479618787765503, "learning_rate": 1.25e-06, "loss": -0.0124, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.027545053511857986, "mask/share_reasoning": 0.8279811143875122, "mask/share_step_conf": 0.14056754112243652, "num_tokens": 38888585.0, "reward": 1.2375892400741577, "reward_std": 0.2888715863227844, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.6382601261138916, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7727559804916382, "step": 155 }, { "adv/mean_abs_final_conf": 0.5545140504837036, "adv/mean_abs_reasoning": 0.4322436451911926, "adv/mean_abs_step_conf": 0.7410604953765869, "adv/ratio_final_to_reasoning": 1.282873806596804, "adv/ratio_step_to_reasoning": 1.7144508742258004, "adv/std_final_conf": 0.7995932698249817, "adv/std_reasoning": 0.7205738425254822, "adv/std_step_conf": 0.9348962306976318, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6980792724311116, "calib/avg_num_step_conf": 7.79296875, "calib/ece": 0.24755102040816335, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6816326530612244, "calib/gap": 0.3311680466947198, "calib/mean_conf": 0.7417551020408162, "calib/mu_c": 0.8850359712230216, "calib/mu_w": 0.5538679245283018, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.21097959183673476, "calib/std_conf": 0.3934833726026877, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5458590733590734, "calib/step_q_c_n": 1036.0, "calib/step_q_gap": 0.168372107769918, "calib/step_q_w": 0.37748696558915534, "calib/step_q_w_n": 959.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3048.0, "completions/max_terminated_length": 3048.0, "completions/mean_length": 598.31640625, "completions/mean_terminated_length": 607.8135375976562, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.1664, "grad_norm": 0.03416966274380684, "learning_rate": 1.2222222222222223e-06, "loss": -0.0556, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.028070878237485886, "mask/share_reasoning": 0.8195517659187317, "mask/share_step_conf": 0.13675233721733093, "num_tokens": 39146514.0, "reward": 1.2828004360198975, "reward_std": 0.26979345083236694, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6931429505348206, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.787010133266449, "step": 156 }, { "adv/mean_abs_final_conf": 0.5185093879699707, "adv/mean_abs_reasoning": 0.44146716594696045, "adv/mean_abs_step_conf": 0.7368483543395996, "adv/ratio_final_to_reasoning": 1.1745140476251554, "adv/ratio_step_to_reasoning": 1.6690898240620853, "adv/std_final_conf": 0.7898529171943665, "adv/std_reasoning": 0.7574617862701416, "adv/std_step_conf": 0.9352151155471802, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7419051204819278, "calib/avg_num_step_conf": 8.0703125, "calib/ece": 0.19674796747967488, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7479674796747967, "calib/gap": 0.3956746987951808, "calib/mean_conf": 0.79, "calib/mu_c": 0.9186746987951807, "calib/mu_w": 0.5229999999999999, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15597560975609762, "calib/std_conf": 0.3765472146730723, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5667429837518465, "calib/step_q_c_n": 1354.0, "calib/step_q_gap": 0.1697626466731948, "calib/step_q_w": 0.3969803370786517, "calib/step_q_w_n": 712.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2964.0, "completions/max_terminated_length": 2964.0, "completions/mean_length": 597.6796875, "completions/mean_terminated_length": 607.1666870117188, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.16746666666666668, "grad_norm": 0.041828371584415436, "learning_rate": 1.1944444444444446e-06, "loss": -0.017, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.02795276790857315, "mask/share_reasoning": 0.8079496622085571, "mask/share_step_conf": 0.1484726071357727, "num_tokens": 39403248.0, "reward": 1.3735963106155396, "reward_std": 0.2610359191894531, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.7679358720779419, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8279096484184265, "step": 157 }, { "adv/mean_abs_final_conf": 0.5031821727752686, "adv/mean_abs_reasoning": 0.42407286167144775, "adv/mean_abs_step_conf": 0.7414059638977051, "adv/ratio_final_to_reasoning": 1.1865465071073353, "adv/ratio_step_to_reasoning": 1.7482985376039284, "adv/std_final_conf": 0.7528063058853149, "adv/std_reasoning": 0.7013627290725708, "adv/std_step_conf": 0.9352811574935913, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6027310363247863, "calib/avg_num_step_conf": 6.875, "calib/ece": 0.3140873015873016, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8134920634920635, "calib/gap": 0.14328525641025647, "calib/mean_conf": 0.8659920634920636, "calib/mu_c": 0.9205769230769232, "calib/mu_w": 0.7772916666666667, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.280515873015873, "calib/std_conf": 0.29557619263279233, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6128088803088804, "calib/step_q_c_n": 1036.0, "calib/step_q_gap": 0.14988070351330024, "calib/step_q_w": 0.4629281767955801, "calib/step_q_w_n": 724.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1946.0, "completions/max_terminated_length": 1946.0, "completions/mean_length": 561.27734375, "completions/mean_terminated_length": 567.9328002929688, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.16853333333333334, "grad_norm": 0.03425997868180275, "learning_rate": 1.1666666666666668e-06, "loss": -0.0805, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.030332986265420914, "mask/share_reasoning": 0.8240467309951782, "mask/share_step_conf": 0.13390159606933594, "num_tokens": 39652175.0, "reward": 1.3016924858093262, "reward_std": 0.2724427580833435, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6727285385131836, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8059532046318054, "step": 158 }, { "adv/mean_abs_final_conf": 0.4782908856868744, "adv/mean_abs_reasoning": 0.4805372357368469, "adv/mean_abs_step_conf": 0.739375650882721, "adv/ratio_final_to_reasoning": 0.9953253361385658, "adv/ratio_step_to_reasoning": 1.5386438259024318, "adv/std_final_conf": 0.7405784726142883, "adv/std_reasoning": 0.7393211722373962, "adv/std_step_conf": 0.9332762360572815, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6278726544381193, "calib/avg_num_step_conf": 7.671875, "calib/ece": 0.29825203252032517, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7032520325203252, "calib/gap": 0.18947712418300655, "calib/mean_conf": 0.7711788617886178, "calib/mu_c": 0.8428104575163398, "calib/mu_w": 0.6533333333333332, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22373983739837394, "calib/std_conf": 0.3682288863246713, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.578344562078922, "calib/step_q_c_n": 1039.0, "calib/step_q_gap": 0.18837699451135453, "calib/step_q_w": 0.3899675675675675, "calib/step_q_w_n": 925.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1482.0, "completions/max_terminated_length": 1482.0, "completions/mean_length": 534.94921875, "completions/mean_terminated_length": 552.2056274414062, "completions/min_length": 0.0, "completions/min_terminated_length": 208.0, "epoch": 0.1696, "grad_norm": 0.029841555282473564, "learning_rate": 1.138888888888889e-06, "loss": -0.1624, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.02883203700184822, "mask/share_reasoning": 0.8057365417480469, "mask/share_step_conf": 0.134181410074234, "num_tokens": 39893906.0, "reward": 1.3323006629943848, "reward_std": 0.2667738199234009, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6689214706420898, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.841980516910553, "step": 159 }, { "adv/mean_abs_final_conf": 0.5557472705841064, "adv/mean_abs_reasoning": 0.5128598213195801, "adv/mean_abs_step_conf": 0.7253711223602295, "adv/ratio_final_to_reasoning": 1.083624116145768, "adv/ratio_step_to_reasoning": 1.4143652752790445, "adv/std_final_conf": 0.7941585779190063, "adv/std_reasoning": 0.7755072116851807, "adv/std_step_conf": 0.9360411763191223, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.6706115107913669, "calib/avg_num_step_conf": 8.74609375, "calib/ece": 0.26849372384937237, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.6736401673640168, "calib/gap": 0.2852575539568346, "calib/mean_conf": 0.738702928870293, "calib/mu_c": 0.8580575539568346, "calib/mu_w": 0.5728, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.21280334728033473, "calib/std_conf": 0.395331003243248, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.591060762100927, "calib/step_q_c_n": 971.0, "calib/step_q_gap": 0.26627369585487015, "calib/step_q_w": 0.3247870662460568, "calib/step_q_w_n": 1268.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2616.0, "completions/max_terminated_length": 2616.0, "completions/mean_length": 600.89453125, "completions/mean_terminated_length": 630.4467163085938, "completions/min_length": 0.0, "completions/min_terminated_length": 202.0, "epoch": 0.17066666666666666, "grad_norm": 0.02899031713604927, "learning_rate": 1.111111111111111e-06, "loss": -0.2876, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.02542033977806568, "mask/share_reasoning": 0.8041006922721863, "mask/share_step_conf": 0.1236039474606514, "num_tokens": 40152575.0, "reward": 1.2626303434371948, "reward_std": 0.3472943902015686, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6632011532783508, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.783764123916626, "step": 160 }, { "adv/mean_abs_final_conf": 0.502620279788971, "adv/mean_abs_reasoning": 0.34476661682128906, "adv/mean_abs_step_conf": 0.7360956072807312, "adv/ratio_final_to_reasoning": 1.4578565767854081, "adv/ratio_step_to_reasoning": 2.1350547627477776, "adv/std_final_conf": 0.775112509727478, "adv/std_reasoning": 0.6613057851791382, "adv/std_step_conf": 0.9351096153259277, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7037220843672455, "calib/avg_num_step_conf": 7.28515625, "calib/ece": 0.20486055776892437, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6812749003984063, "calib/gap": 0.3267311827956989, "calib/mean_conf": 0.7501195219123507, "calib/mu_c": 0.8347311827956989, "calib/mu_w": 0.508, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10697211155378496, "calib/std_conf": 0.3778809463559852, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5683183856502243, "calib/step_q_c_n": 1338.0, "calib/step_q_gap": 0.14742654504301367, "calib/step_q_w": 0.42089184060721063, "calib/step_q_w_n": 527.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2242.0, "completions/max_terminated_length": 2242.0, "completions/mean_length": 548.92578125, "completions/mean_terminated_length": 557.638916015625, "completions/min_length": 0.0, "completions/min_terminated_length": 190.0, "epoch": 0.17173333333333332, "grad_norm": 0.03485341742634773, "learning_rate": 1.0833333333333335e-06, "loss": -0.0284, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03024507313966751, "mask/share_reasoning": 0.8108487129211426, "mask/share_step_conf": 0.14328116178512573, "num_tokens": 40397020.0, "reward": 1.3916150331497192, "reward_std": 0.22925958037376404, "rewards/accuracy_reward_step": 0.7265625, "rewards/final_brier_reward_step": 0.775180459022522, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8333216309547424, "step": 161 }, { "adv/mean_abs_final_conf": 0.5494496822357178, "adv/mean_abs_reasoning": 0.3901814818382263, "adv/mean_abs_step_conf": 0.7636798620223999, "adv/ratio_final_to_reasoning": 1.4081900546564787, "adv/ratio_step_to_reasoning": 1.9572427128641392, "adv/std_final_conf": 0.7924970984458923, "adv/std_reasoning": 0.6614232063293457, "adv/std_step_conf": 0.9354234933853149, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7297174487772637, "calib/avg_num_step_conf": 7.3046875, "calib/ece": 0.1895528455284553, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.7154471544715447, "calib/gap": 0.35837904824851274, "calib/mean_conf": 0.7747560975609756, "calib/mu_c": 0.873820224719101, "calib/mu_w": 0.5154411764705883, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.12036585365853661, "calib/std_conf": 0.3684549997666794, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.5647347266881029, "calib/step_q_c_n": 1244.0, "calib/step_q_gap": 0.21722673946765558, "calib/step_q_w": 0.3475079872204473, "calib/step_q_w_n": 626.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1621.0, "completions/max_terminated_length": 1621.0, "completions/mean_length": 526.6171875, "completions/mean_terminated_length": 537.1076049804688, "completions/min_length": 0.0, "completions/min_terminated_length": 209.0, "epoch": 0.1728, "grad_norm": 0.031547948718070984, "learning_rate": 1.0555555555555557e-06, "loss": -0.0952, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.030170854181051254, "mask/share_reasoning": 0.8118693828582764, "mask/share_step_conf": 0.13842850923538208, "num_tokens": 40635978.0, "reward": 1.3593121767044067, "reward_std": 0.28900277614593506, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.759925365447998, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8145056962966919, "step": 162 }, { "adv/mean_abs_final_conf": 0.5519617199897766, "adv/mean_abs_reasoning": 0.4598156213760376, "adv/mean_abs_step_conf": 0.7201074361801147, "adv/ratio_final_to_reasoning": 1.20039792980061, "adv/ratio_step_to_reasoning": 1.566078668717543, "adv/std_final_conf": 0.8111170530319214, "adv/std_reasoning": 0.7574796676635742, "adv/std_step_conf": 0.9355893731117249, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7833166499833166, "calib/avg_num_step_conf": 7.70703125, "calib/ece": 0.19756097560975622, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6544715447154471, "calib/gap": 0.44075275275275283, "calib/mean_conf": 0.7343089430894308, "calib/mu_c": 0.9331851851851852, "calib/mu_w": 0.4924324324324324, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1915447154471546, "calib/std_conf": 0.3867271076271738, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6137882352941176, "calib/step_q_c_n": 850.0, "calib/step_q_gap": 0.2832005238070295, "calib/step_q_w": 0.33058771148708815, "calib/step_q_w_n": 1123.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2150.0, "completions/max_terminated_length": 2150.0, "completions/mean_length": 607.08984375, "completions/mean_terminated_length": 619.1832885742188, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.17386666666666667, "grad_norm": 0.030279580503702164, "learning_rate": 1.0277777777777777e-06, "loss": -0.1582, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.028492219746112823, "mask/share_reasoning": 0.8214021921157837, "mask/share_step_conf": 0.1305743008852005, "num_tokens": 40896225.0, "reward": 1.3520934581756592, "reward_std": 0.3107609152793884, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7559499740600586, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.825290322303772, "step": 163 }, { "adv/mean_abs_final_conf": 0.6321254968643188, "adv/mean_abs_reasoning": 0.48479893803596497, "adv/mean_abs_step_conf": 0.7586383819580078, "adv/ratio_final_to_reasoning": 1.3038920824067985, "adv/ratio_step_to_reasoning": 1.5648515754416277, "adv/std_final_conf": 0.8424050807952881, "adv/std_reasoning": 0.739427924156189, "adv/std_step_conf": 0.9356386661529541, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7565453813104188, "calib/avg_num_step_conf": 8.234375, "calib/ece": 0.2139183673469388, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.6285714285714286, "calib/gap": 0.4061983082706766, "calib/mean_conf": 0.7064897959183672, "calib/mu_c": 0.8921804511278195, "calib/mu_w": 0.4859821428571429, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18877551020408165, "calib/std_conf": 0.3947325688567956, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5480887011615628, "calib/step_q_c_n": 947.0, "calib/step_q_gap": 0.21559688376276864, "calib/step_q_w": 0.3324918173987942, "calib/step_q_w_n": 1161.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2880.0, "completions/max_terminated_length": 2880.0, "completions/mean_length": 679.82421875, "completions/mean_terminated_length": 687.8853759765625, "completions/min_length": 0.0, "completions/min_terminated_length": 226.0, "epoch": 0.17493333333333333, "grad_norm": 0.026235386729240417, "learning_rate": 1.0000000000000002e-06, "loss": -0.0761, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.02434472180902958, "mask/share_reasoning": 0.8386214375495911, "mask/share_step_conf": 0.1253151148557663, "num_tokens": 41176396.0, "reward": 1.3394906520843506, "reward_std": 0.29882270097732544, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.7377316355705261, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8229684829711914, "step": 164 }, { "adv/mean_abs_final_conf": 0.5562580823898315, "adv/mean_abs_reasoning": 0.43912598490715027, "adv/mean_abs_step_conf": 0.7661980390548706, "adv/ratio_final_to_reasoning": 1.2667391625832571, "adv/ratio_step_to_reasoning": 1.7448250966448209, "adv/std_final_conf": 0.7766196727752686, "adv/std_reasoning": 0.7014268040657043, "adv/std_step_conf": 0.9359138011932373, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7032210553278688, "calib/avg_num_step_conf": 7.41796875, "calib/ece": 0.30076, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.704, "calib/gap": 0.3004316086065574, "calib/mean_conf": 0.76628, "calib/mu_c": 0.912890625, "calib/mu_w": 0.6124590163934426, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.27752, "calib/std_conf": 0.37215502361247255, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5824116161616162, "calib/step_q_c_n": 792.0, "calib/step_q_gap": 0.18214061345158905, "calib/step_q_w": 0.40027100271002714, "calib/step_q_w_n": 1107.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2981.0, "completions/max_terminated_length": 2981.0, "completions/mean_length": 643.3359375, "completions/mean_terminated_length": 648.4015502929688, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.176, "grad_norm": 0.03587152808904648, "learning_rate": 9.722222222222224e-07, "loss": -0.0357, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.026652254164218903, "mask/share_reasoning": 0.8428949117660522, "mask/share_step_conf": 0.12264031916856766, "num_tokens": 41446666.0, "reward": 1.278690218925476, "reward_std": 0.31332042813301086, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.6767804622650146, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7934250235557556, "step": 165 }, { "adv/mean_abs_final_conf": 0.49421077966690063, "adv/mean_abs_reasoning": 0.3963538110256195, "adv/mean_abs_step_conf": 0.7633610963821411, "adv/ratio_final_to_reasoning": 1.2468929676443956, "adv/ratio_step_to_reasoning": 1.9259587649904015, "adv/std_final_conf": 0.7540462017059326, "adv/std_reasoning": 0.6614139676094055, "adv/std_step_conf": 0.9339258670806885, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.8318401937046005, "calib/avg_num_step_conf": 7.5546875, "calib/ece": 0.12331983805668015, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6558704453441295, "calib/gap": 0.5746246973365617, "calib/mean_conf": 0.7174898785425101, "calib/mu_c": 0.8803389830508475, "calib/mu_w": 0.3057142857142857, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.06210526315789472, "calib/std_conf": 0.40888534035530694, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.526060606060606, "calib/step_q_c_n": 1287.0, "calib/step_q_gap": 0.24952892136199706, "calib/step_q_w": 0.27653168469860895, "calib/step_q_w_n": 647.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2197.0, "completions/max_terminated_length": 2197.0, "completions/mean_length": 613.37109375, "completions/mean_terminated_length": 620.644287109375, "completions/min_length": 0.0, "completions/min_terminated_length": 233.0, "epoch": 0.17706666666666668, "grad_norm": 0.033427972346544266, "learning_rate": 9.444444444444445e-07, "loss": -0.0804, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.026531532406806946, "mask/share_reasoning": 0.8271982669830322, "mask/share_step_conf": 0.13455137610435486, "num_tokens": 41709873.0, "reward": 1.4548362493515015, "reward_std": 0.2148941308259964, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.8294359445571899, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8748839497566223, "step": 166 }, { "adv/mean_abs_final_conf": 0.49704134464263916, "adv/mean_abs_reasoning": 0.4293529689311981, "adv/mean_abs_step_conf": 0.7540600895881653, "adv/ratio_final_to_reasoning": 1.1576520499670466, "adv/ratio_step_to_reasoning": 1.7562708171443902, "adv/std_final_conf": 0.7590051293373108, "adv/std_reasoning": 0.7013872265815735, "adv/std_step_conf": 0.9354445934295654, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6131402482522471, "calib/avg_num_step_conf": 7.32421875, "calib/ece": 0.2698393574297187, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7831325301204819, "calib/gap": 0.18734626908260799, "calib/mean_conf": 0.8412449799196787, "calib/mu_c": 0.9059509202453987, "calib/mu_w": 0.7186046511627907, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.22823293172690748, "calib/std_conf": 0.31642908476884707, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5313021702838063, "calib/step_q_c_n": 1198.0, "calib/step_q_gap": 0.03786051592634693, "calib/step_q_w": 0.4934416543574594, "calib/step_q_w_n": 677.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3066.0, "completions/max_terminated_length": 3066.0, "completions/mean_length": 578.46484375, "completions/mean_terminated_length": 585.3241577148438, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.17813333333333334, "grad_norm": 0.041853420436382294, "learning_rate": 9.166666666666666e-07, "loss": -0.0108, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.028374582529067993, "mask/share_reasoning": 0.8254715800285339, "mask/share_step_conf": 0.13443510234355927, "num_tokens": 41963568.0, "reward": 1.3114798069000244, "reward_std": 0.2623671293258667, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7038777470588684, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7986034154891968, "step": 167 }, { "adv/mean_abs_final_conf": 0.5935930609703064, "adv/mean_abs_reasoning": 0.4941191077232361, "adv/mean_abs_step_conf": 0.7486543655395508, "adv/ratio_final_to_reasoning": 1.2013157388416302, "adv/ratio_step_to_reasoning": 1.5151293561367067, "adv/std_final_conf": 0.8258097767829895, "adv/std_reasoning": 0.7576125264167786, "adv/std_step_conf": 0.9353750348091125, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7621713895527483, "calib/avg_num_step_conf": 7.65625, "calib/ece": 0.20207317073170739, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6991869918699187, "calib/gap": 0.36288221235916696, "calib/mean_conf": 0.7828048780487804, "calib/mu_c": 0.9317931034482759, "calib/mu_w": 0.568910891089109, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19772357723577244, "calib/std_conf": 0.35477328518173923, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5597534833869239, "calib/step_q_c_n": 933.0, "calib/step_q_gap": 0.19168142885917316, "calib/step_q_w": 0.3680720545277507, "calib/step_q_w_n": 1027.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2855.0, "completions/max_terminated_length": 2855.0, "completions/mean_length": 629.69140625, "completions/mean_terminated_length": 642.235107421875, "completions/min_length": 0.0, "completions/min_terminated_length": 184.0, "epoch": 0.1792, "grad_norm": 0.02765379659831524, "learning_rate": 8.88888888888889e-07, "loss": -0.0696, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.02691633626818657, "mask/share_reasoning": 0.8299218416213989, "mask/share_step_conf": 0.1236305758357048, "num_tokens": 42229441.0, "reward": 1.343550682067871, "reward_std": 0.3048199713230133, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7402839660644531, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8206743001937866, "step": 168 }, { "adv/mean_abs_final_conf": 0.5480157136917114, "adv/mean_abs_reasoning": 0.3762734830379486, "adv/mean_abs_step_conf": 0.7599032521247864, "adv/ratio_final_to_reasoning": 1.4564292685924987, "adv/ratio_step_to_reasoning": 2.0195503706227083, "adv/std_final_conf": 0.7893987894058228, "adv/std_reasoning": 0.6403399109840393, "adv/std_step_conf": 0.9350810647010803, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.696174018289403, "calib/avg_num_step_conf": 6.55078125, "calib/ece": 0.28497975708502027, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.7368421052631579, "calib/gap": 0.2515384615384615, "calib/mean_conf": 0.821012145748988, "calib/mu_c": 0.9269230769230768, "calib/mu_w": 0.6753846153846154, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.26352226720647776, "calib/std_conf": 0.31959518272987264, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5789989118607182, "calib/step_q_c_n": 919.0, "calib/step_q_gap": 0.1252390174016152, "calib/step_q_w": 0.453759894459103, "calib/step_q_w_n": 758.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2842.0, "completions/max_terminated_length": 2842.0, "completions/mean_length": 606.80078125, "completions/mean_terminated_length": 613.99609375, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.18026666666666666, "grad_norm": 0.04610948637127876, "learning_rate": 8.611111111111112e-07, "loss": -0.0505, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.028334004804491997, "mask/share_reasoning": 0.8365331888198853, "mask/share_step_conf": 0.1234140545129776, "num_tokens": 42488966.0, "reward": 1.3015222549438477, "reward_std": 0.2599875032901764, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6834961175918579, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8086023330688477, "step": 169 }, { "adv/mean_abs_final_conf": 0.5640392303466797, "adv/mean_abs_reasoning": 0.4219498932361603, "adv/mean_abs_step_conf": 0.7920935153961182, "adv/ratio_final_to_reasoning": 1.3367445741501673, "adv/ratio_step_to_reasoning": 1.8772217462152383, "adv/std_final_conf": 0.7761444449424744, "adv/std_reasoning": 0.6817612648010254, "adv/std_step_conf": 0.9347199201583862, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6794758938244854, "calib/avg_num_step_conf": 6.97265625, "calib/ece": 0.28219512195121943, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.7723577235772358, "calib/gap": 0.26059452871072586, "calib/mean_conf": 0.8260975609756097, "calib/mu_c": 0.9362676056338027, "calib/mu_w": 0.6756730769230769, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.26552845528455277, "calib/std_conf": 0.32923077172549003, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.5710483234714004, "calib/step_q_c_n": 1014.0, "calib/step_q_gap": 0.09902497716789849, "calib/step_q_w": 0.47202334630350196, "calib/step_q_w_n": 771.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1783.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 602.64453125, "completions/mean_terminated_length": 605.0078735351562, "completions/min_length": 0.0, "completions/min_terminated_length": 204.0, "epoch": 0.18133333333333335, "grad_norm": 0.032945066690444946, "learning_rate": 8.333333333333333e-07, "loss": -0.0215, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.02677723951637745, "mask/share_reasoning": 0.8398348689079285, "mask/share_step_conf": 0.12948164343833923, "num_tokens": 42747395.0, "reward": 1.2866040468215942, "reward_std": 0.32099831104278564, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6776843667030334, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7969805002212524, "step": 170 }, { "adv/mean_abs_final_conf": 0.6232767105102539, "adv/mean_abs_reasoning": 0.48395657539367676, "adv/mean_abs_step_conf": 0.7597352266311646, "adv/ratio_final_to_reasoning": 1.2878773472666356, "adv/ratio_step_to_reasoning": 1.5698417280789176, "adv/std_final_conf": 0.8432072401046753, "adv/std_reasoning": 0.7394060492515564, "adv/std_step_conf": 0.9357650876045227, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6812934540207267, "calib/avg_num_step_conf": 7.4765625, "calib/ece": 0.305748987854251, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.6720647773279352, "calib/gap": 0.24974419519874058, "calib/mean_conf": 0.7538461538461538, "calib/mu_c": 0.8761904761904762, "calib/mu_w": 0.6264462809917356, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2747368421052631, "calib/std_conf": 0.37176048051651633, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5969871043376319, "calib/step_q_c_n": 853.0, "calib/step_q_gap": 0.21684572827731147, "calib/step_q_w": 0.38014137606032045, "calib/step_q_w_n": 1061.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3026.0, "completions/max_terminated_length": 3026.0, "completions/mean_length": 604.0, "completions/mean_terminated_length": 611.162109375, "completions/min_length": 0.0, "completions/min_terminated_length": 177.0, "epoch": 0.1824, "grad_norm": 0.045893196016550064, "learning_rate": 8.055555555555557e-07, "loss": -0.0116, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.028350790962576866, "mask/share_reasoning": 0.8282653093338013, "mask/share_step_conf": 0.13166514039039612, "num_tokens": 43008915.0, "reward": 1.2718992233276367, "reward_std": 0.3006881773471832, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.6478304862976074, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8030619621276855, "step": 171 }, { "adv/mean_abs_final_conf": 0.4694840908050537, "adv/mean_abs_reasoning": 0.4077759385108948, "adv/mean_abs_step_conf": 0.7584755420684814, "adv/ratio_final_to_reasoning": 1.1513285764714394, "adv/ratio_step_to_reasoning": 1.8600301548891336, "adv/std_final_conf": 0.7224934101104736, "adv/std_reasoning": 0.6815813779830933, "adv/std_step_conf": 0.9330750107765198, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7557434813248767, "calib/avg_num_step_conf": 6.8984375, "calib/ece": 0.20354581673306776, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7768924302788844, "calib/gap": 0.3175362931642002, "calib/mean_conf": 0.857808764940239, "calib/mu_c": 0.9666060606060607, "calib/mu_w": 0.6490697674418605, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.20199203187250997, "calib/std_conf": 0.2910955732261504, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5770408888888889, "calib/step_q_c_n": 1125.0, "calib/step_q_gap": 0.15041062367828045, "calib/step_q_w": 0.4266302652106085, "calib/step_q_w_n": 641.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3006.0, "completions/max_terminated_length": 3006.0, "completions/mean_length": 586.81640625, "completions/mean_terminated_length": 586.81640625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.18346666666666667, "grad_norm": 0.039151061326265335, "learning_rate": 7.777777777777779e-07, "loss": 0.0318, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.02954472042620182, "mask/share_reasoning": 0.8346071243286133, "mask/share_step_conf": 0.13584816455841064, "num_tokens": 43262492.0, "reward": 1.4082257747650146, "reward_std": 0.24087859690189362, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7774074077606201, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8570220470428467, "step": 172 }, { "adv/mean_abs_final_conf": 0.5621016025543213, "adv/mean_abs_reasoning": 0.464821457862854, "adv/mean_abs_step_conf": 0.7540398240089417, "adv/ratio_final_to_reasoning": 1.2092849696284242, "adv/ratio_step_to_reasoning": 1.6222138871898246, "adv/std_final_conf": 0.8106486797332764, "adv/std_reasoning": 0.7574560046195984, "adv/std_step_conf": 0.9359198808670044, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6301006889242183, "calib/avg_num_step_conf": 6.94140625, "calib/ece": 0.31708, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.872, "calib/gap": 0.16937334393216774, "calib/mean_conf": 0.9077200000000001, "calib/mu_c": 0.9768243243243244, "calib/mu_w": 0.8074509803921567, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.3164, "calib/std_conf": 0.2519984158680368, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5796259124087592, "calib/step_q_c_n": 1096.0, "calib/step_q_gap": 0.038377747944735585, "calib/step_q_w": 0.5412481644640236, "calib/step_q_w_n": 681.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1650.0, "completions/max_terminated_length": 1650.0, "completions/mean_length": 565.44140625, "completions/mean_terminated_length": 572.146240234375, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.18453333333333333, "grad_norm": 0.029890544712543488, "learning_rate": 7.5e-07, "loss": -0.1264, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.030869608744978905, "mask/share_reasoning": 0.8199148774147034, "mask/share_step_conf": 0.13749675452709198, "num_tokens": 43510405.0, "reward": 1.2570515871047974, "reward_std": 0.3210733234882355, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6597632169723511, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7720918655395508, "step": 173 }, { "adv/mean_abs_final_conf": 0.6808509826660156, "adv/mean_abs_reasoning": 0.6322292685508728, "adv/mean_abs_step_conf": 0.7378787994384766, "adv/ratio_final_to_reasoning": 1.0769051933115785, "adv/ratio_step_to_reasoning": 1.1671063586312005, "adv/std_final_conf": 0.8761774301528931, "adv/std_reasoning": 0.8268210291862488, "adv/std_step_conf": 0.9360378980636597, "calib/answer_extract_rate": 0.91015625, "calib/auroc": 0.5756345926800472, "calib/avg_num_step_conf": 7.52734375, "calib/ece": 0.3387982832618026, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.6781115879828327, "calib/gap": 0.12098878394332935, "calib/mean_conf": 0.7917596566523606, "calib/mu_c": 0.8499173553719008, "calib/mu_w": 0.7289285714285715, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.30562231759656655, "calib/std_conf": 0.3279849948204185, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5179640718562875, "calib/step_q_c_n": 835.0, "calib/step_q_gap": 0.10083952973174531, "calib/step_q_w": 0.41712454212454214, "calib/step_q_w_n": 1092.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 3071.0, "completions/max_terminated_length": 3071.0, "completions/mean_length": 609.5390625, "completions/mean_terminated_length": 639.516357421875, "completions/min_length": 0.0, "completions/min_terminated_length": 211.0, "epoch": 0.1856, "grad_norm": 0.0357220433652401, "learning_rate": 7.222222222222222e-07, "loss": -0.2293, "mask/has_final_conf_rate": 0.91015625, "mask/share_final_conf": 0.02538606896996498, "mask/share_reasoning": 0.8105248808860779, "mask/share_step_conf": 0.11721404641866684, "num_tokens": 43770679.0, "reward": 1.1440212726593018, "reward_std": 0.38225239515304565, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.5724663734436035, "rewards/format_reward_step": 0.91015625, "rewards/step_l2_reward": 0.7195067405700684, "step": 174 }, { "adv/mean_abs_final_conf": 0.6520330905914307, "adv/mean_abs_reasoning": 0.5189327001571655, "adv/mean_abs_step_conf": 0.7571752071380615, "adv/ratio_final_to_reasoning": 1.2564887323422747, "adv/ratio_step_to_reasoning": 1.459100971876972, "adv/std_final_conf": 0.8270578384399414, "adv/std_reasoning": 0.7576694488525391, "adv/std_step_conf": 0.9357485175132751, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.75656194750442, "calib/avg_num_step_conf": 7.09765625, "calib/ece": 0.23403292181069957, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.5473251028806584, "calib/gap": 0.3640922072623417, "calib/mean_conf": 0.663909465020576, "calib/mu_c": 0.8571929824561402, "calib/mu_w": 0.49310077519379847, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.21440329218106996, "calib/std_conf": 0.39867365496285584, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5756865284974093, "calib/step_q_c_n": 772.0, "calib/step_q_gap": 0.20009801175099784, "calib/step_q_w": 0.37558851674641147, "calib/step_q_w_n": 1045.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2466.0, "completions/max_terminated_length": 2466.0, "completions/mean_length": 621.71875, "completions/mean_terminated_length": 636.6400146484375, "completions/min_length": 0.0, "completions/min_terminated_length": 190.0, "epoch": 0.18666666666666668, "grad_norm": 0.03226575627923012, "learning_rate": 6.944444444444446e-07, "loss": -0.1542, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.02714480832219124, "mask/share_reasoning": 0.8222717046737671, "mask/share_step_conf": 0.12714600563049316, "num_tokens": 44035663.0, "reward": 1.2900152206420898, "reward_std": 0.3247193992137909, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.6946007609367371, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.803652286529541, "step": 175 }, { "adv/mean_abs_final_conf": 0.5556491017341614, "adv/mean_abs_reasoning": 0.460569828748703, "adv/mean_abs_step_conf": 0.7447050213813782, "adv/ratio_final_to_reasoning": 1.2064383445258975, "adv/ratio_step_to_reasoning": 1.6169209854771134, "adv/std_final_conf": 0.7920377850532532, "adv/std_reasoning": 0.7206613421440125, "adv/std_step_conf": 0.9342063069343567, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7433612858141161, "calib/avg_num_step_conf": 6.44140625, "calib/ece": 0.18899598393574296, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.6184738955823293, "calib/gap": 0.3336415094339622, "calib/mean_conf": 0.7320481927710843, "calib/mu_c": 0.8526415094339622, "calib/mu_w": 0.519, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.14124497991967871, "calib/std_conf": 0.3658782437419884, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5316715257531585, "calib/step_q_c_n": 1029.0, "calib/step_q_gap": 0.15713926768864228, "calib/step_q_w": 0.3745322580645162, "calib/step_q_w_n": 620.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2412.0, "completions/max_terminated_length": 2412.0, "completions/mean_length": 562.90234375, "completions/mean_terminated_length": 567.3346557617188, "completions/min_length": 0.0, "completions/min_terminated_length": 177.0, "epoch": 0.18773333333333334, "grad_norm": 0.03709743171930313, "learning_rate": 6.666666666666667e-07, "loss": -0.0819, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03193432092666626, "mask/share_reasoning": 0.8266017436981201, "mask/share_step_conf": 0.13365145027637482, "num_tokens": 44283830.0, "reward": 1.4114222526550293, "reward_std": 0.22874104976654053, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7592554688453674, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8724194765090942, "step": 176 }, { "adv/mean_abs_final_conf": 0.5837680101394653, "adv/mean_abs_reasoning": 0.45486849546432495, "adv/mean_abs_step_conf": 0.7547392845153809, "adv/ratio_final_to_reasoning": 1.283377538696236, "adv/ratio_step_to_reasoning": 1.6592472154945594, "adv/std_final_conf": 0.7963255047798157, "adv/std_reasoning": 0.720579981803894, "adv/std_step_conf": 0.9347895979881287, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7630876068376069, "calib/avg_num_step_conf": 7.2578125, "calib/ece": 0.19181451612903228, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6653225806451613, "calib/gap": 0.37500534188034207, "calib/mean_conf": 0.7494758064516129, "calib/mu_c": 0.9067361111111113, "calib/mu_w": 0.5317307692307692, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18032258064516132, "calib/std_conf": 0.3656701308227984, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5499669239250276, "calib/step_q_c_n": 907.0, "calib/step_q_gap": 0.1828691321269203, "calib/step_q_w": 0.36709779179810725, "calib/step_q_w_n": 951.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2871.0, "completions/max_terminated_length": 2871.0, "completions/mean_length": 602.71484375, "completions/mean_terminated_length": 605.0784912109375, "completions/min_length": 0.0, "completions/min_terminated_length": 197.0, "epoch": 0.1888, "grad_norm": 0.04204968363046646, "learning_rate": 6.388888888888889e-07, "loss": -0.0151, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.0288112610578537, "mask/share_reasoning": 0.8356721997261047, "mask/share_step_conf": 0.13161027431488037, "num_tokens": 44541957.0, "reward": 1.3634679317474365, "reward_std": 0.2572265863418579, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.752631664276123, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8340271711349487, "step": 177 }, { "adv/mean_abs_final_conf": 0.5842238664627075, "adv/mean_abs_reasoning": 0.5355183482170105, "adv/mean_abs_step_conf": 0.7687432169914246, "adv/ratio_final_to_reasoning": 1.0909502324390197, "adv/ratio_step_to_reasoning": 1.4355123770285894, "adv/std_final_conf": 0.8104052543640137, "adv/std_reasoning": 0.7754615545272827, "adv/std_step_conf": 0.9334971904754639, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8196256381168463, "calib/avg_num_step_conf": 6.71484375, "calib/ece": 0.11960000000000004, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.632, "calib/gap": 0.4809600113442996, "calib/mean_conf": 0.7414400000000001, "calib/mu_c": 0.9068902439024391, "calib/mu_w": 0.4259302325581395, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.10252000000000006, "calib/std_conf": 0.36475022467436535, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5849324324324324, "calib/step_q_c_n": 1036.0, "calib/step_q_gap": 0.2448592259902655, "calib/step_q_w": 0.34007320644216693, "calib/step_q_w_n": 683.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3070.0, "completions/max_terminated_length": 3070.0, "completions/mean_length": 577.5703125, "completions/mean_terminated_length": 579.8353271484375, "completions/min_length": 0.0, "completions/min_terminated_length": 126.0, "epoch": 0.18986666666666666, "grad_norm": 0.03877848759293556, "learning_rate": 6.111111111111112e-07, "loss": 0.0605, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.030248302966356277, "mask/share_reasoning": 0.8328169584274292, "mask/share_step_conf": 0.13302844762802124, "num_tokens": 44795887.0, "reward": 1.4552483558654785, "reward_std": 0.23271851241588593, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.8311171531677246, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8783615827560425, "step": 178 }, { "adv/mean_abs_final_conf": 0.5848767757415771, "adv/mean_abs_reasoning": 0.4726482927799225, "adv/mean_abs_step_conf": 0.7531768679618835, "adv/ratio_final_to_reasoning": 1.2374460770852953, "adv/ratio_step_to_reasoning": 1.5935249940965779, "adv/std_final_conf": 0.828561544418335, "adv/std_reasoning": 0.7392279505729675, "adv/std_step_conf": 0.9353491067886353, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7433426144070822, "calib/avg_num_step_conf": 6.70703125, "calib/ece": 0.2012096774193548, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6895161290322581, "calib/gap": 0.3211594202898551, "calib/mean_conf": 0.7751612903225806, "calib/mu_c": 0.8878260869565218, "calib/mu_w": 0.5666666666666667, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.16358870967741934, "calib/std_conf": 0.3512752892307421, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5760626185958255, "calib/step_q_c_n": 1054.0, "calib/step_q_gap": 0.19030092930472442, "calib/step_q_w": 0.3857616892911011, "calib/step_q_w_n": 663.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2766.0, "completions/max_terminated_length": 2766.0, "completions/mean_length": 576.4140625, "completions/mean_terminated_length": 580.9527587890625, "completions/min_length": 0.0, "completions/min_terminated_length": 193.0, "epoch": 0.19093333333333334, "grad_norm": 0.03158321604132652, "learning_rate": 5.833333333333334e-07, "loss": -0.0242, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.029713882133364677, "mask/share_reasoning": 0.8328595161437988, "mask/share_step_conf": 0.12961412966251373, "num_tokens": 45049713.0, "reward": 1.3898929357528687, "reward_std": 0.2539806663990021, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7549265623092651, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8522734045982361, "step": 179 }, { "adv/mean_abs_final_conf": 0.5628108978271484, "adv/mean_abs_reasoning": 0.46406090259552, "adv/mean_abs_step_conf": 0.7706776857376099, "adv/ratio_final_to_reasoning": 1.2127953350073533, "adv/ratio_step_to_reasoning": 1.6607253087410814, "adv/std_final_conf": 0.7955601811408997, "adv/std_reasoning": 0.720676839351654, "adv/std_step_conf": 0.9343445897102356, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7493001119820828, "calib/avg_num_step_conf": 7.2734375, "calib/ece": 0.17995528455284543, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.6544715447154471, "calib/gap": 0.3767978723404255, "calib/mean_conf": 0.7510203252032519, "calib/mu_c": 0.8949999999999999, "calib/mu_w": 0.5182021276595744, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.15654471544715437, "calib/std_conf": 0.3682110393586609, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5186773905272565, "calib/step_q_c_n": 1119.0, "calib/step_q_gap": 0.15622786159051355, "calib/step_q_w": 0.3624495289367429, "calib/step_q_w_n": 743.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2983.0, "completions/max_terminated_length": 2983.0, "completions/mean_length": 630.296875, "completions/mean_terminated_length": 642.8526000976562, "completions/min_length": 0.0, "completions/min_terminated_length": 243.0, "epoch": 0.192, "grad_norm": 0.043362926691770554, "learning_rate": 5.555555555555555e-07, "loss": -0.189, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.025322385132312775, "mask/share_reasoning": 0.8317726254463196, "mask/share_step_conf": 0.12337373197078705, "num_tokens": 45314925.0, "reward": 1.3708226680755615, "reward_std": 0.25812745094299316, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7541399002075195, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8386746048927307, "step": 180 }, { "adv/mean_abs_final_conf": 0.5692725777626038, "adv/mean_abs_reasoning": 0.49015751481056213, "adv/mean_abs_step_conf": 0.7483924627304077, "adv/ratio_final_to_reasoning": 1.1614074263099248, "adv/ratio_step_to_reasoning": 1.5268407402050117, "adv/std_final_conf": 0.8138942122459412, "adv/std_reasoning": 0.7575979232788086, "adv/std_step_conf": 0.9353100061416626, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7661928868120457, "calib/avg_num_step_conf": 6.49609375, "calib/ece": 0.21764940239043823, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6533864541832669, "calib/gap": 0.33786928868120447, "calib/mean_conf": 0.7564541832669323, "calib/mu_c": 0.900486111111111, "calib/mu_w": 0.5626168224299065, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.20019920318725096, "calib/std_conf": 0.35548824078313745, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5786396103896104, "calib/step_q_c_n": 924.0, "calib/step_q_gap": 0.1654731692529392, "calib/step_q_w": 0.4131664411366712, "calib/step_q_w_n": 739.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1782.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 518.62109375, "completions/mean_terminated_length": 524.770751953125, "completions/min_length": 0.0, "completions/min_terminated_length": 206.0, "epoch": 0.19306666666666666, "grad_norm": 0.05061223730444908, "learning_rate": 5.277777777777779e-07, "loss": -0.1146, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.030669106170535088, "mask/share_reasoning": 0.8251529335975647, "mask/share_step_conf": 0.13245923817157745, "num_tokens": 45553956.0, "reward": 1.358069896697998, "reward_std": 0.26986604928970337, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7428753972053528, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8335071206092834, "step": 181 }, { "adv/mean_abs_final_conf": 0.5297540426254272, "adv/mean_abs_reasoning": 0.39702683687210083, "adv/mean_abs_step_conf": 0.7356059551239014, "adv/ratio_final_to_reasoning": 1.3343028567010533, "adv/ratio_step_to_reasoning": 1.8527864789172204, "adv/std_final_conf": 0.7600104808807373, "adv/std_reasoning": 0.681530237197876, "adv/std_step_conf": 0.9350858330726624, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.800428746427113, "calib/avg_num_step_conf": 7.05078125, "calib/ece": 0.20266932270916346, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.701195219123506, "calib/gap": 0.35521709541309365, "calib/mean_conf": 0.7966135458167332, "calib/mu_c": 0.9282278481012658, "calib/mu_w": 0.5730107526881721, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18490039840637462, "calib/std_conf": 0.3345605520735549, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5788622754491017, "calib/step_q_c_n": 1002.0, "calib/step_q_gap": 0.21201296038060852, "calib/step_q_w": 0.3668493150684932, "calib/step_q_w_n": 803.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2697.0, "completions/max_terminated_length": 2697.0, "completions/mean_length": 556.67578125, "completions/mean_terminated_length": 561.05908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 214.0, "epoch": 0.19413333333333332, "grad_norm": 0.04269418865442276, "learning_rate": 5.000000000000001e-07, "loss": -0.0214, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.028990257531404495, "mask/share_reasoning": 0.8292310237884521, "mask/share_step_conf": 0.13396620750427246, "num_tokens": 45802625.0, "reward": 1.417762041091919, "reward_std": 0.26264894008636475, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7771191596984863, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8694368600845337, "step": 182 }, { "adv/mean_abs_final_conf": 0.6165659427642822, "adv/mean_abs_reasoning": 0.5332478284835815, "adv/mean_abs_step_conf": 0.7585545182228088, "adv/ratio_final_to_reasoning": 1.156246513966378, "adv/ratio_step_to_reasoning": 1.42251778198505, "adv/std_final_conf": 0.843794047832489, "adv/std_reasoning": 0.7929291129112244, "adv/std_step_conf": 0.9355603456497192, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.682703737947431, "calib/avg_num_step_conf": 6.8203125, "calib/ece": 0.2678137651821862, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6072874493927125, "calib/gap": 0.27145951657641, "calib/mean_conf": 0.7070040485829959, "calib/mu_c": 0.8311940298507462, "calib/mu_w": 0.5597345132743362, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2161538461538461, "calib/std_conf": 0.38714559604135523, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5481409477521264, "calib/step_q_c_n": 823.0, "calib/step_q_gap": 0.16593076357011127, "calib/step_q_w": 0.38221018418201513, "calib/step_q_w_n": 923.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2836.0, "completions/max_terminated_length": 2836.0, "completions/mean_length": 595.52734375, "completions/mean_terminated_length": 607.3904418945312, "completions/min_length": 0.0, "completions/min_terminated_length": 218.0, "epoch": 0.1952, "grad_norm": 0.03282076120376587, "learning_rate": 4.7222222222222226e-07, "loss": -0.1704, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.0280508603900671, "mask/share_reasoning": 0.8383486270904541, "mask/share_step_conf": 0.11406920850276947, "num_tokens": 46061760.0, "reward": 1.2791484594345093, "reward_std": 0.3096787929534912, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6807718276977539, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7903251051902771, "step": 183 }, { "adv/mean_abs_final_conf": 0.6084942817687988, "adv/mean_abs_reasoning": 0.43991637229919434, "adv/mean_abs_step_conf": 0.7658754587173462, "adv/ratio_final_to_reasoning": 1.3832044454007089, "adv/ratio_step_to_reasoning": 1.740956933961216, "adv/std_final_conf": 0.814765214920044, "adv/std_reasoning": 0.7014064788818359, "adv/std_step_conf": 0.93467777967453, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6863379443700729, "calib/avg_num_step_conf": 6.9765625, "calib/ece": 0.21659919028340083, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7894736842105263, "calib/gap": 0.2433556448014279, "calib/mean_conf": 0.8442914979757083, "calib/mu_c": 0.9240963855421687, "calib/mu_w": 0.6807407407407408, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.19441295546558707, "calib/std_conf": 0.30395137990264104, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.541272051996286, "calib/step_q_c_n": 1077.0, "calib/step_q_gap": 0.14540463309642704, "calib/step_q_w": 0.395867418899859, "calib/step_q_w_n": 709.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2915.0, "completions/max_terminated_length": 2915.0, "completions/mean_length": 576.0234375, "completions/mean_terminated_length": 585.1666870117188, "completions/min_length": 0.0, "completions/min_terminated_length": 214.0, "epoch": 0.19626666666666667, "grad_norm": 0.031147437170147896, "learning_rate": 4.444444444444445e-07, "loss": -0.0018, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.02811710722744465, "mask/share_reasoning": 0.8329929113388062, "mask/share_step_conf": 0.12326496839523315, "num_tokens": 46314502.0, "reward": 1.3443111181259155, "reward_std": 0.27114933729171753, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.734431266784668, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8157672882080078, "step": 184 }, { "adv/mean_abs_final_conf": 0.5487991571426392, "adv/mean_abs_reasoning": 0.48162105679512024, "adv/mean_abs_step_conf": 0.740757405757904, "adv/ratio_final_to_reasoning": 1.139483312450137, "adv/ratio_step_to_reasoning": 1.538050289344013, "adv/std_final_conf": 0.7968400120735168, "adv/std_reasoning": 0.7392687797546387, "adv/std_step_conf": 0.9352030158042908, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.770456240254898, "calib/avg_num_step_conf": 6.73046875, "calib/ece": 0.1746774193548388, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.657258064516129, "calib/gap": 0.40739610873839066, "calib/mean_conf": 0.7407258064516129, "calib/mu_c": 0.9033557046979865, "calib/mu_w": 0.4959595959595959, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1572983870967743, "calib/std_conf": 0.37258074116742235, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5475188781014024, "calib/step_q_c_n": 927.0, "calib/step_q_gap": 0.175898275086327, "calib/step_q_w": 0.37162060301507543, "calib/step_q_w_n": 796.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2862.0, "completions/max_terminated_length": 2862.0, "completions/mean_length": 566.06640625, "completions/mean_terminated_length": 577.3426513671875, "completions/min_length": 0.0, "completions/min_terminated_length": 196.0, "epoch": 0.19733333333333333, "grad_norm": 0.03484448790550232, "learning_rate": 4.1666666666666667e-07, "loss": -0.0444, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.030003804713487625, "mask/share_reasoning": 0.8277981877326965, "mask/share_step_conf": 0.12266676127910614, "num_tokens": 46566335.0, "reward": 1.3510828018188477, "reward_std": 0.2744816243648529, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.766825795173645, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8133730888366699, "step": 185 }, { "adv/mean_abs_final_conf": 0.5438560843467712, "adv/mean_abs_reasoning": 0.4110875725746155, "adv/mean_abs_step_conf": 0.758434534072876, "adv/ratio_final_to_reasoning": 1.3229689259167698, "adv/ratio_step_to_reasoning": 1.8449463926210379, "adv/std_final_conf": 0.760007381439209, "adv/std_reasoning": 0.6817102432250977, "adv/std_step_conf": 0.9331895709037781, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7306553493880774, "calib/avg_num_step_conf": 6.9921875, "calib/ece": 0.2259760956175299, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6613545816733067, "calib/gap": 0.31593499144624304, "calib/mean_conf": 0.753625498007968, "calib/mu_c": 0.882013422818792, "calib/mu_w": 0.566078431372549, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.19298804780876494, "calib/std_conf": 0.3672379993879625, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5677588168373151, "calib/step_q_c_n": 879.0, "calib/step_q_gap": 0.18072259290756754, "calib/step_q_w": 0.38703622392974757, "calib/step_q_w_n": 911.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2909.0, "completions/max_terminated_length": 2909.0, "completions/mean_length": 564.75, "completions/mean_terminated_length": 571.4466552734375, "completions/min_length": 0.0, "completions/min_terminated_length": 177.0, "epoch": 0.1984, "grad_norm": 0.04410063102841377, "learning_rate": 3.8888888888888895e-07, "loss": -0.037, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.030299216508865356, "mask/share_reasoning": 0.832628607749939, "mask/share_step_conf": 0.12535345554351807, "num_tokens": 46815951.0, "reward": 1.3567540645599365, "reward_std": 0.24842774868011475, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.732350766658783, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8347193598747253, "step": 186 }, { "adv/mean_abs_final_conf": 0.6464817523956299, "adv/mean_abs_reasoning": 0.5685774683952332, "adv/mean_abs_step_conf": 0.7855064868927002, "adv/ratio_final_to_reasoning": 1.137016129429602, "adv/ratio_step_to_reasoning": 1.3815293967060158, "adv/std_final_conf": 0.8438383936882019, "adv/std_reasoning": 0.7929400205612183, "adv/std_step_conf": 0.9357006549835205, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6160920320347497, "calib/avg_num_step_conf": 7.67578125, "calib/ece": 0.31314285714285706, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6081632653061224, "calib/gap": 0.13770598615447271, "calib/mean_conf": 0.7271836734693878, "calib/mu_c": 0.7867625899280575, "calib/mu_w": 0.6490566037735848, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.23648979591836725, "calib/std_conf": 0.36267237553795345, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5256306306306306, "calib/step_q_c_n": 888.0, "calib/step_q_gap": 0.1536343446510578, "calib/step_q_w": 0.37199628597957285, "calib/step_q_w_n": 1077.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2661.0, "completions/max_terminated_length": 2661.0, "completions/mean_length": 575.4140625, "completions/mean_terminated_length": 593.9757690429688, "completions/min_length": 0.0, "completions/min_terminated_length": 207.0, "epoch": 0.19946666666666665, "grad_norm": 0.03723229467868805, "learning_rate": 3.611111111111111e-07, "loss": -0.2321, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.027325410395860672, "mask/share_reasoning": 0.8210077285766602, "mask/share_step_conf": 0.12041684985160828, "num_tokens": 47064801.0, "reward": 1.249746561050415, "reward_std": 0.3241482675075531, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6286714673042297, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.786191999912262, "step": 187 }, { "adv/mean_abs_final_conf": 0.5331639647483826, "adv/mean_abs_reasoning": 0.4191775321960449, "adv/mean_abs_step_conf": 0.7717061042785645, "adv/ratio_final_to_reasoning": 1.2719287743194867, "adv/ratio_step_to_reasoning": 1.8410006381679007, "adv/std_final_conf": 0.7863588929176331, "adv/std_reasoning": 0.7014192342758179, "adv/std_step_conf": 0.9349312782287598, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7297077922077922, "calib/avg_num_step_conf": 7.03125, "calib/ece": 0.20650602409638558, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7188755020080321, "calib/gap": 0.3225232919254659, "calib/mean_conf": 0.812289156626506, "calib/mu_c": 0.9262732919254658, "calib/mu_w": 0.6037499999999999, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18610441767068275, "calib/std_conf": 0.3238820449469957, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5754664179104477, "calib/step_q_c_n": 1072.0, "calib/step_q_gap": 0.17839224208627186, "calib/step_q_w": 0.39707417582417587, "calib/step_q_w_n": 728.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2316.0, "completions/max_terminated_length": 2316.0, "completions/mean_length": 551.86328125, "completions/mean_terminated_length": 560.623046875, "completions/min_length": 0.0, "completions/min_terminated_length": 192.0, "epoch": 0.20053333333333334, "grad_norm": 0.041271813213825226, "learning_rate": 3.3333333333333335e-07, "loss": -0.0684, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.0306197889149189, "mask/share_reasoning": 0.8181106448173523, "mask/share_step_conf": 0.1356445550918579, "num_tokens": 47310150.0, "reward": 1.390303134918213, "reward_std": 0.2478947937488556, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7650250196456909, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8476343154907227, "step": 188 }, { "adv/mean_abs_final_conf": 0.5081390738487244, "adv/mean_abs_reasoning": 0.32521361112594604, "adv/mean_abs_step_conf": 0.7686993479728699, "adv/ratio_final_to_reasoning": 1.5624778805827302, "adv/ratio_step_to_reasoning": 2.3636752020049197, "adv/std_final_conf": 0.745349645614624, "adv/std_reasoning": 0.5961154699325562, "adv/std_step_conf": 0.9345230460166931, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7175714472138849, "calib/avg_num_step_conf": 6.1796875, "calib/ece": 0.22227450980392155, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5764705882352941, "calib/gap": 0.3352218452303275, "calib/mean_conf": 0.6709019607843137, "calib/mu_c": 0.7984177215189874, "calib/mu_w": 0.4631958762886599, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13678431372549016, "calib/std_conf": 0.40453746305597355, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.530032119914347, "calib/step_q_c_n": 934.0, "calib/step_q_gap": 0.18185310756866796, "calib/step_q_w": 0.348179012345679, "calib/step_q_w_n": 648.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2624.0, "completions/max_terminated_length": 2624.0, "completions/mean_length": 544.19921875, "completions/mean_terminated_length": 544.19921875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.2016, "grad_norm": 0.03426051139831543, "learning_rate": 3.055555555555556e-07, "loss": 0.0191, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03142692893743515, "mask/share_reasoning": 0.841963529586792, "mask/share_step_conf": 0.12660948932170868, "num_tokens": 47557233.0, "reward": 1.3983986377716064, "reward_std": 0.1729370504617691, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7530906200408936, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8605251312255859, "step": 189 }, { "adv/mean_abs_final_conf": 0.6338986754417419, "adv/mean_abs_reasoning": 0.41939041018486023, "adv/mean_abs_step_conf": 0.7677431106567383, "adv/ratio_final_to_reasoning": 1.5114763238442435, "adv/ratio_step_to_reasoning": 1.8306167523437888, "adv/std_final_conf": 0.8425524234771729, "adv/std_reasoning": 0.7013546228408813, "adv/std_step_conf": 0.9354836344718933, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7676586003924133, "calib/avg_num_step_conf": 7.09765625, "calib/ece": 0.19530120481927704, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5943775100401606, "calib/gap": 0.3700019620667102, "calib/mean_conf": 0.7113654618473896, "calib/mu_c": 0.874820143884892, "calib/mu_w": 0.5048181818181818, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.17421686746987947, "calib/std_conf": 0.3776525237392623, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5215325248070563, "calib/step_q_c_n": 907.0, "calib/step_q_gap": 0.1547523050268365, "calib/step_q_w": 0.36678021978021974, "calib/step_q_w_n": 910.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2315.0, "completions/max_terminated_length": 2315.0, "completions/mean_length": 608.9609375, "completions/mean_terminated_length": 616.1818237304688, "completions/min_length": 0.0, "completions/min_terminated_length": 195.0, "epoch": 0.20266666666666666, "grad_norm": 0.04120016098022461, "learning_rate": 2.7777777777777776e-07, "loss": -0.0373, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.027381490916013718, "mask/share_reasoning": 0.8419528007507324, "mask/share_step_conf": 0.11894698441028595, "num_tokens": 47818735.0, "reward": 1.3498507738113403, "reward_std": 0.25871074199676514, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.7487621307373047, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8239072561264038, "step": 190 }, { "adv/mean_abs_final_conf": 0.6130039691925049, "adv/mean_abs_reasoning": 0.45739710330963135, "adv/mean_abs_step_conf": 0.7466780543327332, "adv/ratio_final_to_reasoning": 1.3402008118480293, "adv/ratio_step_to_reasoning": 1.6324503345778194, "adv/std_final_conf": 0.8566485643386841, "adv/std_reasoning": 0.7392563223838806, "adv/std_step_conf": 0.9357996582984924, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6787549407114625, "calib/avg_num_step_conf": 7.6796875, "calib/ece": 0.2614979757085019, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.659919028340081, "calib/gap": 0.3105711462450593, "calib/mean_conf": 0.7515384615384615, "calib/mu_c": 0.8961363636363637, "calib/mu_w": 0.5855652173913044, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23931174089068816, "calib/std_conf": 0.36746431364155935, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.544176072234763, "calib/step_q_c_n": 886.0, "calib/step_q_gap": 0.18581496112365198, "calib/step_q_w": 0.3583611111111111, "calib/step_q_w_n": 1080.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2777.0, "completions/max_terminated_length": 2777.0, "completions/mean_length": 549.3671875, "completions/mean_terminated_length": 562.552001953125, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.20373333333333332, "grad_norm": 0.06811851263046265, "learning_rate": 2.5000000000000004e-07, "loss": -0.1334, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.030976133421063423, "mask/share_reasoning": 0.8067940473556519, "mask/share_step_conf": 0.1387922763824463, "num_tokens": 48063541.0, "reward": 1.3004114627838135, "reward_std": 0.2827233374118805, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.698123037815094, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.803303062915802, "step": 191 }, { "adv/mean_abs_final_conf": 0.5427058935165405, "adv/mean_abs_reasoning": 0.3278455138206482, "adv/mean_abs_step_conf": 0.7394140958786011, "adv/ratio_final_to_reasoning": 1.6553708092325286, "adv/ratio_step_to_reasoning": 2.25537353633915, "adv/std_final_conf": 0.7968785762786865, "adv/std_reasoning": 0.6404013633728027, "adv/std_step_conf": 0.9348773956298828, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.8160141301148072, "calib/avg_num_step_conf": 7.12890625, "calib/ece": 0.14954918032786899, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6680327868852459, "calib/gap": 0.43998086546953197, "calib/mean_conf": 0.7689754098360656, "calib/mu_c": 0.9240506329113924, "calib/mu_w": 0.4840697674418605, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.13549180327868865, "calib/std_conf": 0.35132561523452, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5471717171717172, "calib/step_q_c_n": 990.0, "calib/step_q_gap": 0.2089202201657292, "calib/step_q_w": 0.338251497005988, "calib/step_q_w_n": 835.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2617.0, "completions/max_terminated_length": 2617.0, "completions/mean_length": 587.4296875, "completions/mean_terminated_length": 599.1314697265625, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.2048, "grad_norm": 0.04900625720620155, "learning_rate": 2.2222222222222224e-07, "loss": -0.1622, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.030505452305078506, "mask/share_reasoning": 0.8191900253295898, "mask/share_step_conf": 0.13077324628829956, "num_tokens": 48318899.0, "reward": 1.398130178451538, "reward_std": 0.27167192101478577, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7914214134216309, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8453881740570068, "step": 192 }, { "adv/mean_abs_final_conf": 0.701214075088501, "adv/mean_abs_reasoning": 0.630457878112793, "adv/mean_abs_step_conf": 0.7696429491043091, "adv/ratio_final_to_reasoning": 1.1122298561602704, "adv/ratio_step_to_reasoning": 1.2207682318256559, "adv/std_final_conf": 0.8996307253837585, "adv/std_reasoning": 0.8267195820808411, "adv/std_step_conf": 0.9354685544967651, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7785456730769231, "calib/avg_num_step_conf": 6.60546875, "calib/ece": 0.18568548387096784, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5604838709677419, "calib/gap": 0.3694871794871795, "calib/mean_conf": 0.6846370967741936, "calib/mu_c": 0.8395833333333333, "calib/mu_w": 0.47009615384615383, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.14483870967741946, "calib/std_conf": 0.37652039816819033, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.525945945945946, "calib/step_q_c_n": 888.0, "calib/step_q_gap": 0.14681020497458858, "calib/step_q_w": 0.37913574097135744, "calib/step_q_w_n": 803.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2824.0, "completions/max_terminated_length": 2824.0, "completions/mean_length": 587.8359375, "completions/mean_terminated_length": 597.1666870117188, "completions/min_length": 0.0, "completions/min_terminated_length": 233.0, "epoch": 0.20586666666666667, "grad_norm": 0.04521409794688225, "learning_rate": 1.9444444444444447e-07, "loss": 0.0169, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.027636289596557617, "mask/share_reasoning": 0.8352762460708618, "mask/share_step_conf": 0.12146242707967758, "num_tokens": 48575097.0, "reward": 1.3805063962936401, "reward_std": 0.27369970083236694, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7593636512756348, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8476996421813965, "step": 193 }, { "adv/mean_abs_final_conf": 0.6136116981506348, "adv/mean_abs_reasoning": 0.4309585988521576, "adv/mean_abs_step_conf": 0.7773469686508179, "adv/ratio_final_to_reasoning": 1.423829805890791, "adv/ratio_step_to_reasoning": 1.803762520857579, "adv/std_final_conf": 0.8142480254173279, "adv/std_reasoning": 0.701364278793335, "adv/std_step_conf": 0.9353013634681702, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7455775577557754, "calib/avg_num_step_conf": 6.56640625, "calib/ece": 0.21454183266932267, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.6334661354581673, "calib/gap": 0.3424349834983498, "calib/mean_conf": 0.7214741035856573, "calib/mu_c": 0.8592666666666666, "calib/mu_w": 0.5168316831683168, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.16920318725099598, "calib/std_conf": 0.3844339481429212, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5600668151447662, "calib/step_q_c_n": 898.0, "calib/step_q_gap": 0.16935161718818897, "calib/step_q_w": 0.3907151979565772, "calib/step_q_w_n": 783.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1768.0, "completions/max_terminated_length": 1768.0, "completions/mean_length": 536.14453125, "completions/mean_terminated_length": 546.82470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 189.0, "epoch": 0.20693333333333333, "grad_norm": 0.036717407405376434, "learning_rate": 1.6666666666666668e-07, "loss": -0.0308, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.030489569529891014, "mask/share_reasoning": 0.8255350589752197, "mask/share_step_conf": 0.12444411218166351, "num_tokens": 48818294.0, "reward": 1.3486157655715942, "reward_std": 0.26706817746162415, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7385668158531189, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8234729766845703, "step": 194 }, { "adv/mean_abs_final_conf": 0.5755367279052734, "adv/mean_abs_reasoning": 0.46918901801109314, "adv/mean_abs_step_conf": 0.7148308753967285, "adv/ratio_final_to_reasoning": 1.2266628284374421, "adv/ratio_step_to_reasoning": 1.5235456243773966, "adv/std_final_conf": 0.8129864931106567, "adv/std_reasoning": 0.7393112182617188, "adv/std_step_conf": 0.9357689023017883, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.6910894660894662, "calib/avg_num_step_conf": 6.88671875, "calib/ece": 0.24008368200836827, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.6317991631799164, "calib/gap": 0.2835815295815297, "calib/mean_conf": 0.7251046025104603, "calib/mu_c": 0.8425714285714286, "calib/mu_w": 0.5589898989898989, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.18970711297071136, "calib/std_conf": 0.37731917382077446, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.5250125142207055, "calib/step_q_c_n": 879.0, "calib/step_q_gap": 0.15727608888133898, "calib/step_q_w": 0.3677364253393665, "calib/step_q_w_n": 884.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3051.0, "completions/max_terminated_length": 3051.0, "completions/mean_length": 575.69921875, "completions/mean_terminated_length": 594.2701416015625, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.208, "grad_norm": 0.04009755328297615, "learning_rate": 1.3888888888888888e-07, "loss": -0.1084, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.028049079701304436, "mask/share_reasoning": 0.817302942276001, "mask/share_step_conf": 0.12339800596237183, "num_tokens": 49071657.0, "reward": 1.2790379524230957, "reward_std": 0.27997830510139465, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.679855465888977, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.7922351360321045, "step": 195 }, { "adv/mean_abs_final_conf": 0.526882529258728, "adv/mean_abs_reasoning": 0.3654212951660156, "adv/mean_abs_step_conf": 0.7486469745635986, "adv/ratio_final_to_reasoning": 1.441849547983673, "adv/ratio_step_to_reasoning": 2.048722897288946, "adv/std_final_conf": 0.7710081338882446, "adv/std_reasoning": 0.6402504444122314, "adv/std_step_conf": 0.9332286715507507, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7198146372643017, "calib/avg_num_step_conf": 6.4921875, "calib/ece": 0.24598425196850385, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7086614173228346, "calib/gap": 0.3077494407158837, "calib/mean_conf": 0.7851968503937008, "calib/mu_c": 0.9124161073825504, "calib/mu_w": 0.6046666666666667, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22228346456692905, "calib/std_conf": 0.3452862566700246, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6140914036996734, "calib/step_q_c_n": 919.0, "calib/step_q_gap": 0.19530270921784298, "calib/step_q_w": 0.4187886944818304, "calib/step_q_w_n": 743.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2571.0, "completions/max_terminated_length": 2571.0, "completions/mean_length": 482.88671875, "completions/mean_terminated_length": 484.7804260253906, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.20906666666666668, "grad_norm": 0.033003684133291245, "learning_rate": 1.1111111111111112e-07, "loss": 0.0064, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03300885111093521, "mask/share_reasoning": 0.8257684707641602, "mask/share_step_conf": 0.13731642067432404, "num_tokens": 49297820.0, "reward": 1.3620078563690186, "reward_std": 0.1846241056919098, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7422569990158081, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8334574699401855, "step": 196 }, { "adv/mean_abs_final_conf": 0.6287498474121094, "adv/mean_abs_reasoning": 0.4730614423751831, "adv/mean_abs_step_conf": 0.7427409291267395, "adv/ratio_final_to_reasoning": 1.329108211092483, "adv/ratio_step_to_reasoning": 1.570072854379188, "adv/std_final_conf": 0.8411765098571777, "adv/std_reasoning": 0.7207943201065063, "adv/std_step_conf": 0.9351040720939636, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7412587412587412, "calib/avg_num_step_conf": 7.07421875, "calib/ece": 0.21534412955465593, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5951417004048583, "calib/gap": 0.2999125874125874, "calib/mean_conf": 0.7295951417004047, "calib/mu_c": 0.8558741258741259, "calib/mu_w": 0.5559615384615385, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18299595141700412, "calib/std_conf": 0.35663269707665646, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5274426229508197, "calib/step_q_c_n": 915.0, "calib/step_q_gap": 0.14353637295081967, "calib/step_q_w": 0.38390625, "calib/step_q_w_n": 896.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1876.0, "completions/max_terminated_length": 1876.0, "completions/mean_length": 536.7734375, "completions/mean_terminated_length": 554.0886840820312, "completions/min_length": 0.0, "completions/min_terminated_length": 202.0, "epoch": 0.21013333333333334, "grad_norm": 0.03876349329948425, "learning_rate": 8.333333333333334e-08, "loss": -0.2178, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.029733015224337578, "mask/share_reasoning": 0.810360312461853, "mask/share_step_conf": 0.12865662574768066, "num_tokens": 49540290.0, "reward": 1.3443280458450317, "reward_std": 0.2846136689186096, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7261113524436951, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8289285898208618, "step": 197 }, { "adv/mean_abs_final_conf": 0.5739809274673462, "adv/mean_abs_reasoning": 0.41014930605888367, "adv/mean_abs_step_conf": 0.7451508045196533, "adv/ratio_final_to_reasoning": 1.3994438585858335, "adv/ratio_step_to_reasoning": 1.8167793862186241, "adv/std_final_conf": 0.8088400363922119, "adv/std_reasoning": 0.6613656878471375, "adv/std_step_conf": 0.9343599677085876, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7693077230892357, "calib/avg_num_step_conf": 7.0234375, "calib/ece": 0.17290836653386443, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6055776892430279, "calib/gap": 0.3876623982926503, "calib/mean_conf": 0.7000796812749004, "calib/mu_c": 0.851437908496732, "calib/mu_w": 0.46377551020408164, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13171314741035847, "calib/std_conf": 0.38553940278665894, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5258260869565218, "calib/step_q_c_n": 1035.0, "calib/step_q_gap": 0.122103937546299, "calib/step_q_w": 0.4037221494102228, "calib/step_q_w_n": 763.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1477.0, "completions/max_terminated_length": 1477.0, "completions/mean_length": 516.171875, "completions/mean_terminated_length": 526.4542236328125, "completions/min_length": 0.0, "completions/min_terminated_length": 185.0, "epoch": 0.2112, "grad_norm": 0.026938652619719505, "learning_rate": 5.555555555555556e-08, "loss": -0.0684, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03219270706176758, "mask/share_reasoning": 0.8046937584877014, "mask/share_step_conf": 0.1435822695493698, "num_tokens": 49777814.0, "reward": 1.4053709506988525, "reward_std": 0.23416009545326233, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7742702960968018, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8604233264923096, "step": 198 }, { "adv/mean_abs_final_conf": 0.6629692316055298, "adv/mean_abs_reasoning": 0.5119156837463379, "adv/mean_abs_step_conf": 0.7785393595695496, "adv/ratio_final_to_reasoning": 1.2950750536763027, "adv/ratio_step_to_reasoning": 1.5208351380680256, "adv/std_final_conf": 0.8606733083724976, "adv/std_reasoning": 0.7576802372932434, "adv/std_step_conf": 0.9351163506507874, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7201812647018125, "calib/avg_num_step_conf": 7.4140625, "calib/ece": 0.23914285714285713, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.6530612244897959, "calib/gap": 0.26443960149439594, "calib/mean_conf": 0.7648571428571428, "calib/mu_c": 0.8717123287671232, "calib/mu_w": 0.6072727272727273, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2040408163265306, "calib/std_conf": 0.3489361089988614, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5189263157894737, "calib/step_q_c_n": 950.0, "calib/step_q_gap": 0.15909509216078171, "calib/step_q_w": 0.35983122362869197, "calib/step_q_w_n": 948.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3031.0, "completions/max_terminated_length": 3031.0, "completions/mean_length": 599.40234375, "completions/mean_terminated_length": 618.7379150390625, "completions/min_length": 0.0, "completions/min_terminated_length": 114.0, "epoch": 0.21226666666666666, "grad_norm": 0.045903194695711136, "learning_rate": 2.777777777777778e-08, "loss": -0.0885, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.029736265540122986, "mask/share_reasoning": 0.8093239068984985, "mask/share_step_conf": 0.12968984246253967, "num_tokens": 50035461.0, "reward": 1.333052635192871, "reward_std": 0.30268052220344543, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.704621434211731, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8280073404312134, "step": 199 }, { "adv/mean_abs_final_conf": 0.5917401313781738, "adv/mean_abs_reasoning": 0.4564133584499359, "adv/mean_abs_step_conf": 0.753562331199646, "adv/ratio_final_to_reasoning": 1.2965004648151242, "adv/ratio_step_to_reasoning": 1.6510523131024974, "adv/std_final_conf": 0.8196789026260376, "adv/std_reasoning": 0.7206570506095886, "adv/std_step_conf": 0.9346625804901123, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.7932464886107271, "calib/avg_num_step_conf": 6.26171875, "calib/ece": 0.17487603305785127, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.6446280991735537, "calib/gap": 0.3713557965213594, "calib/mean_conf": 0.743801652892562, "calib/mu_c": 0.8834437086092716, "calib/mu_w": 0.5120879120879122, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.14735537190082648, "calib/std_conf": 0.36617919155496315, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5808118214716526, "calib/step_q_c_n": 829.0, "calib/step_q_gap": 0.22709089123909437, "calib/step_q_w": 0.3537209302325582, "calib/step_q_w_n": 774.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2845.0, "completions/max_terminated_length": 2845.0, "completions/mean_length": 552.8046875, "completions/mean_terminated_length": 568.3453369140625, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.21333333333333335, "grad_norm": 0.04398513212800026, "learning_rate": 0.0, "loss": -0.1976, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.030634820461273193, "mask/share_reasoning": 0.8216097354888916, "mask/share_step_conf": 0.12041172385215759, "num_tokens": 50285027.0, "reward": 1.3560163974761963, "reward_std": 0.26149389147758484, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7444355487823486, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8306736946105957, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": -0.03754158863797784, "train_runtime": 17053.3275, "train_samples_per_second": 3.002, "train_steps_per_second": 0.012 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 50285027, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }