{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.38076182006817844, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.2003187250996017, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2948207171314741, "calib/gap": -0.026059730250481805, "calib/mean_conf": 0.8737051792828686, "calib/mu_c": 0.865606936416185, "calib/mu_w": 0.8916666666666668, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19239043824701207, "calib/std_conf": 0.09027744273295583, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7959393232205367, "calib/step_q_c_n": 857.0, "calib/step_q_gap": -0.006446568895645877, "calib/step_q_w": 0.8023858921161826, "calib/step_q_w_n": 482.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 474.94921875, "completions/mean_terminated_length": 478.68896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.0010666666666666667, "grad_norm": 0.007058217190206051, "kl": 0.000291675329208374, "learning_rate": 2.5000000000000004e-07, "loss": 0.0332, "num_tokens": 229171.0, "reward": 0.5959470272064209, "reward_std": 0.34700775146484375, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7142800688743591, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.14714524149894714, "step": 1 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.44343065693430656, "calib/avg_num_step_conf": 5.05859375, "calib/ece": 0.3349411764705883, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2823529411764706, "calib/gap": 0.002352468143016151, "calib/mean_conf": 0.8721960784313726, "calib/mu_c": 0.8732846715328467, "calib/mu_w": 0.8709322033898306, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3349411764705883, "calib/std_conf": 0.07627016470309335, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7954391371340525, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.011011892552009073, "calib/step_q_w": 0.7844272445820434, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1966.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 492.9765625, "completions/mean_terminated_length": 494.9098205566406, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.0021333333333333334, "grad_norm": 0.00732036679983139, "kl": 0.00037539005279541016, "learning_rate": 5.000000000000001e-07, "loss": 0.0149, "num_tokens": 458661.0, "reward": 0.5325330495834351, "reward_std": 0.35858410596847534, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6320762038230896, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.12673982977867126, "step": 2 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.43875927889713684, "calib/avg_num_step_conf": 5.11328125, "calib/ece": 0.2455859374999999, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.36328125, "calib/gap": -0.008799045599151345, "calib/mean_conf": 0.8862109374999999, "calib/mu_c": 0.883048780487805, "calib/mu_w": 0.8918478260869563, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2455859374999999, "calib/std_conf": 0.04548457024498631, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8016937669376695, "calib/step_q_c_n": 738.0, "calib/step_q_gap": 0.04516136763819478, "calib/step_q_w": 0.7565323992994747, "calib/step_q_w_n": 571.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1381.0, "completions/max_terminated_length": 1381.0, "completions/mean_length": 491.5390625, "completions/mean_terminated_length": 493.4667053222656, "completions/min_length": 0.0, "completions/min_terminated_length": 191.0, "epoch": 0.0032, "grad_norm": 0.007544770836830139, "kl": 0.0012790262699127197, "learning_rate": 7.5e-07, "loss": 0.0274, "num_tokens": 689751.0, "reward": 0.5081400275230408, "reward_std": 0.36799156665802, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6991492509841919, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": -0.009431745857000351, "step": 3 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.46746398290480456, "calib/avg_num_step_conf": 4.87109375, "calib/ece": 0.23257936507936522, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.2777777777777778, "calib/gap": 0.0011642655269872293, "calib/mean_conf": 0.8794047619047619, "calib/mu_c": 0.8798159509202456, "calib/mu_w": 0.8786516853932583, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.23257936507936522, "calib/std_conf": 0.05107614585828655, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7943381389252948, "calib/step_q_c_n": 763.0, "calib/step_q_gap": 0.013862932313724596, "calib/step_q_w": 0.7804752066115702, "calib/step_q_w_n": 484.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1832.0, "completions/max_terminated_length": 1832.0, "completions/mean_length": 481.61328125, "completions/mean_terminated_length": 487.3241271972656, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.004266666666666667, "grad_norm": 0.006945696193724871, "kl": 0.00028836727142333984, "learning_rate": 1.0000000000000002e-06, "loss": 0.0012, "num_tokens": 919212.0, "reward": 0.5910017490386963, "reward_std": 0.32807308435440063, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7042097449302673, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.15357500314712524, "step": 4 }, { "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.4042075163398694, "calib/avg_num_step_conf": 4.78125, "calib/ece": 0.32217213114754095, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.27049180327868855, "calib/gap": -0.01393246187363828, "calib/mean_conf": 0.8795491803278689, "calib/mu_c": 0.8733823529411765, "calib/mu_w": 0.8873148148148148, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.32217213114754095, "calib/std_conf": 0.04728923948911935, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.7967194928684628, "calib/step_q_c_n": 631.0, "calib/step_q_gap": 0.011660470946034507, "calib/step_q_w": 0.7850590219224283, "calib/step_q_w_n": 593.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3052.0, "completions/max_terminated_length": 3052.0, "completions/mean_length": 522.75, "completions/mean_terminated_length": 524.800048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.005333333333333333, "grad_norm": 0.006729537155479193, "kl": 0.000265657901763916, "learning_rate": 1.25e-06, "loss": 0.0257, "num_tokens": 1159724.0, "reward": 0.46391651034355164, "reward_std": 0.3454357981681824, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6057500243186951, "rewards/format_reward_step": 0.9453125, "rewards/step_correlation_reward": 0.026770520955324173, "step": 5 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4620373299588738, "calib/avg_num_step_conf": 5.12109375, "calib/ece": 0.31055118110236213, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.32677165354330706, "calib/gap": -0.005713381841189413, "calib/mean_conf": 0.8814173228346457, "calib/mu_c": 0.8789655172413793, "calib/mu_w": 0.8846788990825687, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.31055118110236213, "calib/std_conf": 0.041254432535867074, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7995592286501377, "calib/step_q_c_n": 726.0, "calib/step_q_gap": -0.006611711520802421, "calib/step_q_w": 0.8061709401709402, "calib/step_q_w_n": 585.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2127.0, "completions/max_terminated_length": 2127.0, "completions/mean_length": 440.34765625, "completions/mean_terminated_length": 440.34765625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.0064, "grad_norm": 0.007685510441660881, "kl": 0.0007127821445465088, "learning_rate": 1.5e-06, "loss": 0.0159, "num_tokens": 1378405.0, "reward": 0.5301942825317383, "reward_std": 0.31458911299705505, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6489687561988831, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.09970106184482574, "step": 6 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.48179140447181684, "calib/avg_num_step_conf": 5.40234375, "calib/ece": 0.2674900398406375, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.2788844621513944, "calib/gap": -0.005026107912705968, "calib/mean_conf": 0.8785657370517928, "calib/mu_c": 0.8766233766233766, "calib/mu_w": 0.8816494845360826, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.26625498007968135, "calib/std_conf": 0.0434692575958097, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7888520710059173, "calib/step_q_c_n": 845.0, "calib/step_q_gap": -0.012225995908580867, "calib/step_q_w": 0.8010780669144981, "calib/step_q_w_n": 538.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2899.0, "completions/max_terminated_length": 2899.0, "completions/mean_length": 559.07421875, "completions/mean_terminated_length": 559.07421875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.007466666666666667, "grad_norm": 0.006892060860991478, "kl": 0.00038546323776245117, "learning_rate": 1.75e-06, "loss": 0.0366, "num_tokens": 1628952.0, "reward": 0.527165412902832, "reward_std": 0.33386415243148804, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6698726415634155, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.07039565593004227, "step": 7 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4465880102040817, "calib/avg_num_step_conf": 4.6484375, "calib/ece": 0.3209126984126984, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.27380952380952384, "calib/gap": -0.005874999999999964, "calib/mean_conf": 0.876468253968254, "calib/mu_c": 0.8738571428571429, "calib/mu_w": 0.8797321428571429, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3209126984126984, "calib/std_conf": 0.04910131839757406, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7993548387096774, "calib/step_q_c_n": 620.0, "calib/step_q_gap": 0.03326711941143179, "calib/step_q_w": 0.7660877192982456, "calib/step_q_w_n": 570.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2512.0, "completions/max_terminated_length": 2512.0, "completions/mean_length": 515.6953125, "completions/mean_terminated_length": 517.7176513671875, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.008533333333333334, "grad_norm": 0.006904206704348326, "kl": 0.00042426586151123047, "learning_rate": 2.0000000000000003e-06, "loss": -0.0211, "num_tokens": 1867482.0, "reward": 0.49654126167297363, "reward_std": 0.30775249004364014, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6335644721984863, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.05404934659600258, "step": 8 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5013470400567175, "calib/avg_num_step_conf": 4.91796875, "calib/ece": 0.23780487804878048, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.21544715447154472, "calib/gap": 0.014260191421481827, "calib/mean_conf": 0.8678861788617886, "calib/mu_c": 0.8731612903225807, "calib/mu_w": 0.8589010989010989, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.23780487804878048, "calib/std_conf": 0.07653313255108424, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.7960090361445783, "calib/step_q_c_n": 664.0, "calib/step_q_gap": 0.10142080085046057, "calib/step_q_w": 0.6945882352941177, "calib/step_q_w_n": 595.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2378.0, "completions/max_terminated_length": 2378.0, "completions/mean_length": 491.83203125, "completions/mean_terminated_length": 501.6294860839844, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.0096, "grad_norm": 0.007615099661052227, "kl": 0.0003460049629211426, "learning_rate": 2.25e-06, "loss": 0.0029, "num_tokens": 2100927.0, "reward": 0.5577809810638428, "reward_std": 0.35230207443237305, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6672629117965698, "rewards/format_reward_step": 0.94140625, "rewards/step_correlation_reward": 0.13892391324043274, "step": 9 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.48801742919389984, "calib/avg_num_step_conf": 5.17578125, "calib/ece": 0.2692063492063492, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.2777777777777778, "calib/gap": -0.00218657159833624, "calib/mean_conf": 0.8763492063492063, "calib/mu_c": 0.8754901960784313, "calib/mu_w": 0.8776767676767675, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2692063492063492, "calib/std_conf": 0.04302275467708966, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.786162310866575, "calib/step_q_c_n": 727.0, "calib/step_q_gap": -0.007467271073224313, "calib/step_q_w": 0.7936295819397993, "calib/step_q_w_n": 598.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2478.0, "completions/max_terminated_length": 2478.0, "completions/mean_length": 523.890625, "completions/mean_terminated_length": 525.9451293945312, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.010666666666666666, "grad_norm": 0.0069535886868834496, "kl": 0.0003247857093811035, "learning_rate": 2.5e-06, "loss": 0.0169, "num_tokens": 2341843.0, "reward": 0.5617609620094299, "reward_std": 0.3579865097999573, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6715867519378662, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.1363101601600647, "step": 10 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5209415584415583, "calib/avg_num_step_conf": 5.5078125, "calib/ece": 0.29157480314960627, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.3031496062992126, "calib/gap": -0.009902597402597246, "calib/mean_conf": 0.871496062992126, "calib/mu_c": 0.8675974025974027, "calib/mu_w": 0.8775, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2783858267716535, "calib/std_conf": 0.0895621786401363, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7814958775029447, "calib/step_q_c_n": 849.0, "calib/step_q_gap": -0.019627117149461748, "calib/step_q_w": 0.8011229946524064, "calib/step_q_w_n": 561.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2507.0, "completions/max_terminated_length": 2507.0, "completions/mean_length": 518.60546875, "completions/mean_terminated_length": 518.60546875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.011733333333333333, "grad_norm": 0.006948419380933046, "kl": 0.0003502368927001953, "learning_rate": 2.7500000000000004e-06, "loss": 0.0181, "num_tokens": 2579086.0, "reward": 0.5813484191894531, "reward_std": 0.3301823139190674, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6719730496406555, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.17275497317314148, "step": 11 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.48346796276013143, "calib/avg_num_step_conf": 5.63671875, "calib/ece": 0.22937007874015738, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.2874015748031496, "calib/gap": -0.005866648411828934, "calib/mean_conf": 0.8737795275590551, "calib/mu_c": 0.8717469879518073, "calib/mu_w": 0.8776136363636362, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22480314960629913, "calib/std_conf": 0.05236048780560937, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7873589164785555, "calib/step_q_c_n": 886.0, "calib/step_q_gap": 0.007251196550368766, "calib/step_q_w": 0.7801077199281867, "calib/step_q_w_n": 557.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2107.0, "completions/max_terminated_length": 2107.0, "completions/mean_length": 456.671875, "completions/mean_terminated_length": 458.4627685546875, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.0128, "grad_norm": 0.007615354843437672, "kl": 0.0005735158920288086, "learning_rate": 3e-06, "loss": -0.0368, "num_tokens": 2800170.0, "reward": 0.575323760509491, "reward_std": 0.30451565980911255, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7140507698059082, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.10847172141075134, "step": 12 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5732429335370512, "calib/avg_num_step_conf": 4.75, "calib/ece": 0.26554687500000007, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.265625, "calib/gap": 0.02371530430353963, "calib/mean_conf": 0.867109375, "calib/mu_c": 0.8765584415584416, "calib/mu_w": 0.852843137254902, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26554687500000007, "calib/std_conf": 0.06147474511626197, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.782394366197183, "calib/step_q_c_n": 710.0, "calib/step_q_gap": 0.019232310861214685, "calib/step_q_w": 0.7631620553359684, "calib/step_q_w_n": 506.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1232.0, "completions/max_terminated_length": 1232.0, "completions/mean_length": 458.69921875, "completions/mean_terminated_length": 460.4980773925781, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.013866666666666666, "grad_norm": 0.007240649778395891, "kl": 0.0006138086318969727, "learning_rate": 3.2500000000000002e-06, "loss": 0.0001, "num_tokens": 3022189.0, "reward": 0.6061074137687683, "reward_std": 0.30182117223739624, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6973890662193298, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.1945132613182068, "step": 13 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.36378424100461965, "calib/avg_num_step_conf": 5.3515625, "calib/ece": 0.32348, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.308, "calib/gap": -0.01454811633808295, "calib/mean_conf": 0.8799600000000001, "calib/mu_c": 0.8736170212765958, "calib/mu_w": 0.8881651376146787, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31972, "calib/std_conf": 0.07140587090708998, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7922131147540982, "calib/step_q_c_n": 732.0, "calib/step_q_gap": 0.02354540315535214, "calib/step_q_w": 0.768667711598746, "calib/step_q_w_n": 638.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2403.0, "completions/max_terminated_length": 2403.0, "completions/mean_length": 560.66796875, "completions/mean_terminated_length": 560.66796875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.014933333333333333, "grad_norm": 0.006452104542404413, "kl": 0.002147197723388672, "learning_rate": 3.5e-06, "loss": 0.0202, "num_tokens": 3271120.0, "reward": 0.5175330638885498, "reward_std": 0.34320229291915894, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6269644498825073, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.1026330292224884, "step": 14 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.505960135487233, "calib/avg_num_step_conf": 4.97265625, "calib/ece": 0.2719367588932806, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.2885375494071146, "calib/gap": 0.003715476810838947, "calib/mean_conf": 0.8727272727272728, "calib/mu_c": 0.8742105263157894, "calib/mu_w": 0.8704950495049505, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2719367588932806, "calib/std_conf": 0.0561343033018714, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7717847769028872, "calib/step_q_c_n": 762.0, "calib/step_q_gap": 0.0021761663353725424, "calib/step_q_w": 0.7696086105675146, "calib/step_q_w_n": 511.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1694.0, "completions/max_terminated_length": 1694.0, "completions/mean_length": 456.48046875, "completions/mean_terminated_length": 458.2706298828125, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.016, "grad_norm": 0.007334915455430746, "kl": 0.0011022090911865234, "learning_rate": 3.7500000000000005e-06, "loss": 0.0079, "num_tokens": 3495859.0, "reward": 0.5454412698745728, "reward_std": 0.3382645547389984, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.675797700881958, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.09945990145206451, "step": 15 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.488537710255019, "calib/avg_num_step_conf": 6.421875, "calib/ece": 0.2714859437751005, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.321285140562249, "calib/gap": 0.0017986977753660094, "calib/mean_conf": 0.8790361445783132, "calib/mu_c": 0.8797368421052629, "calib/mu_w": 0.8779381443298969, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2700401606425704, "calib/std_conf": 0.05849411032373637, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7629069767441861, "calib/step_q_c_n": 946.0, "calib/step_q_gap": 0.04061471313387088, "calib/step_q_w": 0.7222922636103152, "calib/step_q_w_n": 698.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2725.0, "completions/max_terminated_length": 2725.0, "completions/mean_length": 652.53125, "completions/mean_terminated_length": 652.53125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.017066666666666667, "grad_norm": 0.00565234012901783, "kl": 0.001227736473083496, "learning_rate": 4.000000000000001e-06, "loss": 0.0352, "num_tokens": 3771755.0, "reward": 0.5504010319709778, "reward_std": 0.34064918756484985, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6676726937294006, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.12062937021255493, "step": 16 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5512839059674502, "calib/avg_num_step_conf": 5.56640625, "calib/ece": 0.1828740157480315, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.25196850393700787, "calib/gap": 0.0065439421338154835, "calib/mean_conf": 0.8718503937007873, "calib/mu_c": 0.8738857142857144, "calib/mu_w": 0.8673417721518989, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1828740157480315, "calib/std_conf": 0.04479306378042393, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7513369477911647, "calib/step_q_c_n": 996.0, "calib/step_q_gap": -0.010970744516527575, "calib/step_q_w": 0.7623076923076922, "calib/step_q_w_n": 429.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2339.0, "completions/max_terminated_length": 2339.0, "completions/mean_length": 499.140625, "completions/mean_terminated_length": 499.140625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.018133333333333335, "grad_norm": 0.006812790408730507, "kl": 0.003714442253112793, "learning_rate": 4.25e-06, "loss": 0.0366, "num_tokens": 4003063.0, "reward": 0.6302497982978821, "reward_std": 0.3418027460575104, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7393969297409058, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.1875089406967163, "step": 17 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4798570705793069, "calib/avg_num_step_conf": 4.9921875, "calib/ece": 0.3447619047619048, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.2619047619047619, "calib/gap": -0.006670882873766715, "calib/mean_conf": 0.8685714285714285, "calib/mu_c": 0.8654477611940299, "calib/mu_w": 0.8721186440677966, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.34079365079365087, "calib/std_conf": 0.07573150448112766, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7421671826625388, "calib/step_q_c_n": 646.0, "calib/step_q_gap": 0.03927161304228555, "calib/step_q_w": 0.7028955696202532, "calib/step_q_w_n": 632.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3067.0, "completions/max_terminated_length": 3067.0, "completions/mean_length": 477.2890625, "completions/mean_terminated_length": 479.16082763671875, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.0192, "grad_norm": 0.00782696157693863, "kl": 0.002428770065307617, "learning_rate": 4.5e-06, "loss": 0.0213, "num_tokens": 4235969.0, "reward": 0.5036895275115967, "reward_std": 0.3225781321525574, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6186789274215698, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.0871376022696495, "step": 18 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5229340645643774, "calib/avg_num_step_conf": 4.29296875, "calib/ece": 0.3119215686274509, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.25882352941176473, "calib/gap": 0.010689268353483716, "calib/mean_conf": 0.8687843137254903, "calib/mu_c": 0.8735211267605634, "calib/mu_w": 0.8628318584070797, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3119215686274509, "calib/std_conf": 0.0641848781059075, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7364781906300485, "calib/step_q_c_n": 619.0, "calib/step_q_gap": 0.006269857296715031, "calib/step_q_w": 0.7302083333333335, "calib/step_q_w_n": 480.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1194.0, "completions/max_terminated_length": 1194.0, "completions/mean_length": 450.98828125, "completions/mean_terminated_length": 452.75689697265625, "completions/min_length": 0.0, "completions/min_terminated_length": 181.0, "epoch": 0.020266666666666665, "grad_norm": 0.007010597735643387, "kl": 0.003600597381591797, "learning_rate": 4.75e-06, "loss": 0.0047, "num_tokens": 4456182.0, "reward": 0.5692561864852905, "reward_std": 0.31815648078918457, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6545273661613464, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.1730474829673767, "step": 19 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.3784722222222222, "calib/avg_num_step_conf": 4.7578125, "calib/ece": 0.2991699604743082, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.20948616600790515, "calib/gap": -0.02455147808358804, "calib/mean_conf": 0.8644664031620554, "calib/mu_c": 0.8538888888888889, "calib/mu_w": 0.878440366972477, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2972332015810276, "calib/std_conf": 0.05866273148199263, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7301494768310912, "calib/step_q_c_n": 669.0, "calib/step_q_gap": -0.004549976720821247, "calib/step_q_w": 0.7346994535519125, "calib/step_q_w_n": 549.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3061.0, "completions/max_terminated_length": 3061.0, "completions/mean_length": 450.96875, "completions/mean_terminated_length": 450.96875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.021333333333333333, "grad_norm": 0.007373226340860128, "kl": 0.0052947998046875, "learning_rate": 5e-06, "loss": 0.0243, "num_tokens": 4676502.0, "reward": 0.5320701599121094, "reward_std": 0.37527555227279663, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6444605588912964, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.10952349007129669, "step": 20 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.43324428227015055, "calib/avg_num_step_conf": 5.33984375, "calib/ece": 0.27456349206349207, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.23809523809523808, "calib/gap": -0.011166351729979707, "calib/mean_conf": 0.8654365079365078, "calib/mu_c": 0.8608724832214764, "calib/mu_w": 0.8720388349514561, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.27436507936507937, "calib/std_conf": 0.05632431542806724, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6966994382022472, "calib/step_q_c_n": 712.0, "calib/step_q_gap": -0.009270027446607765, "calib/step_q_w": 0.705969465648855, "calib/step_q_w_n": 655.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2947.0, "completions/max_terminated_length": 2947.0, "completions/mean_length": 480.9375, "completions/mean_terminated_length": 482.82354736328125, "completions/min_length": 0.0, "completions/min_terminated_length": 33.0, "epoch": 0.0224, "grad_norm": 0.007370475679636002, "kl": 0.006046295166015625, "learning_rate": 4.9722222222222224e-06, "loss": -0.0003, "num_tokens": 4902582.0, "reward": 0.5867160558700562, "reward_std": 0.38447660207748413, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6640527248382568, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.19609802961349487, "step": 21 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5082028085336213, "calib/avg_num_step_conf": 5.26953125, "calib/ece": 0.2203557312252964, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.1541501976284585, "calib/gap": 0.005776397515528053, "calib/mean_conf": 0.8567193675889329, "calib/mu_c": 0.8588198757763976, "calib/mu_w": 0.8530434782608696, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2203557312252964, "calib/std_conf": 0.05548838411002395, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7116451233842537, "calib/step_q_c_n": 851.0, "calib/step_q_gap": 0.022106970773811963, "calib/step_q_w": 0.6895381526104417, "calib/step_q_w_n": 498.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2967.0, "completions/max_terminated_length": 2967.0, "completions/mean_length": 452.34765625, "completions/mean_terminated_length": 452.34765625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.023466666666666667, "grad_norm": 0.007838176563382149, "kl": 0.008509159088134766, "learning_rate": 4.944444444444445e-06, "loss": 0.0488, "num_tokens": 5120199.0, "reward": 0.6486678123474121, "reward_std": 0.32256704568862915, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7099843621253967, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.26469504833221436, "step": 22 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5278980576518619, "calib/avg_num_step_conf": 4.8046875, "calib/ece": 0.3182745098039215, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.17254901960784313, "calib/gap": 0.004140170728689885, "calib/mean_conf": 0.8555294117647059, "calib/mu_c": 0.8574452554744525, "calib/mu_w": 0.8533050847457626, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3182745098039215, "calib/std_conf": 0.05701405156735107, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6932137518684603, "calib/step_q_c_n": 669.0, "calib/step_q_gap": 0.0032137518684602906, "calib/step_q_w": 0.6900000000000001, "calib/step_q_w_n": 561.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1771.0, "completions/max_terminated_length": 1771.0, "completions/mean_length": 461.6796875, "completions/mean_terminated_length": 461.6796875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.024533333333333334, "grad_norm": 0.008555812761187553, "kl": 0.009517669677734375, "learning_rate": 4.9166666666666665e-06, "loss": 0.0203, "num_tokens": 5342325.0, "reward": 0.5603914260864258, "reward_std": 0.3658100962638855, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6425222754478455, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.17279182374477386, "step": 23 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.569568674219837, "calib/avg_num_step_conf": 5.328125, "calib/ece": 0.3778455284552845, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.14227642276422764, "calib/gap": 0.011920095408467613, "calib/mean_conf": 0.849390243902439, "calib/mu_c": 0.8556410256410256, "calib/mu_w": 0.843720930232558, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3758130081300812, "calib/std_conf": 0.06699133958666083, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6887786259541985, "calib/step_q_c_n": 655.0, "calib/step_q_gap": 0.026282152047287344, "calib/step_q_w": 0.6624964739069111, "calib/step_q_w_n": 709.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2857.0, "completions/max_terminated_length": 2857.0, "completions/mean_length": 557.6875, "completions/mean_terminated_length": 562.0787353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.0256, "grad_norm": 0.006697987671941519, "kl": 0.007785320281982422, "learning_rate": 4.888888888888889e-06, "loss": 0.0085, "num_tokens": 5589605.0, "reward": 0.5254364013671875, "reward_std": 0.34020552039146423, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.5884214639663696, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.17807629704475403, "step": 24 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.489, "calib/avg_num_step_conf": 5.1640625, "calib/ece": 0.24513725490196078, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.14901960784313725, "calib/gap": -0.0033225806451614437, "calib/mean_conf": 0.8529803921568627, "calib/mu_c": 0.8516774193548386, "calib/mu_w": 0.8550000000000001, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24513725490196078, "calib/std_conf": 0.057234807110221414, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6781736909323116, "calib/step_q_c_n": 783.0, "calib/step_q_gap": -0.011993284948950023, "calib/step_q_w": 0.6901669758812616, "calib/step_q_w_n": 539.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1672.0, "completions/max_terminated_length": 1672.0, "completions/mean_length": 433.8203125, "completions/mean_terminated_length": 435.5216064453125, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.02666666666666667, "grad_norm": 0.007355384062975645, "kl": 0.012590408325195312, "learning_rate": 4.861111111111111e-06, "loss": 0.0228, "num_tokens": 5803887.0, "reward": 0.583240270614624, "reward_std": 0.32405388355255127, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6939566135406494, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.15221142768859863, "step": 25 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.49590108401084015, "calib/avg_num_step_conf": 4.4609375, "calib/ece": 0.2056692913385827, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.10236220472440945, "calib/gap": 0.0048238482384825865, "calib/mean_conf": 0.8475590551181104, "calib/mu_c": 0.849268292682927, "calib/mu_w": 0.8444444444444444, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.20377952755905512, "calib/std_conf": 0.06647735432820169, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6948640915593705, "calib/step_q_c_n": 699.0, "calib/step_q_gap": 0.03551871909887372, "calib/step_q_w": 0.6593453724604967, "calib/step_q_w_n": 443.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2930.0, "completions/max_terminated_length": 2930.0, "completions/mean_length": 453.71875, "completions/mean_terminated_length": 453.71875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.027733333333333332, "grad_norm": 0.007992461323738098, "kl": 0.012160301208496094, "learning_rate": 4.833333333333333e-06, "loss": 0.0226, "num_tokens": 6025279.0, "reward": 0.6579269170761108, "reward_std": 0.3294398784637451, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7225586175918579, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.26673269271850586, "step": 26 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.490538990825688, "calib/avg_num_step_conf": 5.29296875, "calib/ece": 0.2670355731225296, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.05928853754940711, "calib/gap": 0.00446929153924569, "calib/mean_conf": 0.8327272727272728, "calib/mu_c": 0.8346527777777778, "calib/mu_w": 0.8301834862385321, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.26529644268774705, "calib/std_conf": 0.07480193033531987, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6901335311572699, "calib/step_q_c_n": 674.0, "calib/step_q_gap": 0.04452413321307025, "calib/step_q_w": 0.6456093979441997, "calib/step_q_w_n": 681.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2437.0, "completions/max_terminated_length": 2437.0, "completions/mean_length": 445.82421875, "completions/mean_terminated_length": 447.57257080078125, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.0288, "grad_norm": 0.007899126969277859, "kl": 0.01741313934326172, "learning_rate": 4.805555555555556e-06, "loss": -0.0038, "num_tokens": 6244626.0, "reward": 0.6128519177436829, "reward_std": 0.28570079803466797, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.673927366733551, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.24162021279335022, "step": 27 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4634847210541071, "calib/avg_num_step_conf": 4.00390625, "calib/ece": 0.18398406374501997, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.05179282868525897, "calib/gap": 0.0001976450798991669, "calib/mean_conf": 0.8373705179282868, "calib/mu_c": 0.8374390243902439, "calib/mu_w": 0.8372413793103447, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.18398406374501997, "calib/std_conf": 0.0569498465952437, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.6852743902439024, "calib/step_q_c_n": 656.0, "calib/step_q_gap": 0.009068428184281774, "calib/step_q_w": 0.6762059620596206, "calib/step_q_w_n": 369.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2803.0, "completions/max_terminated_length": 2803.0, "completions/mean_length": 498.1484375, "completions/mean_terminated_length": 500.10198974609375, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.029866666666666666, "grad_norm": 0.007141755428165197, "kl": 0.012704849243164062, "learning_rate": 4.777777777777778e-06, "loss": 0.0374, "num_tokens": 6479096.0, "reward": 0.6511706113815308, "reward_std": 0.30695879459381104, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7094171047210693, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.2718302011489868, "step": 28 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.3990751156105487, "calib/avg_num_step_conf": 4.64453125, "calib/ece": 0.3420948616600791, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.08695652173913043, "calib/gap": -0.02126546681664787, "calib/mean_conf": 0.8400395256916995, "calib/mu_c": 0.8294488188976378, "calib/mu_w": 0.8507142857142856, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3400790513833992, "calib/std_conf": 0.05838063748347232, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6675422138836773, "calib/step_q_c_n": 533.0, "calib/step_q_gap": -0.006482176360225078, "calib/step_q_w": 0.6740243902439024, "calib/step_q_w_n": 656.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2375.0, "completions/max_terminated_length": 2375.0, "completions/mean_length": 530.0546875, "completions/mean_terminated_length": 530.0546875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.030933333333333334, "grad_norm": 0.0068199350498616695, "kl": 0.014654159545898438, "learning_rate": 4.75e-06, "loss": -0.0016, "num_tokens": 6721918.0, "reward": 0.5134754180908203, "reward_std": 0.3352017402648926, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6030503511428833, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.12936919927597046, "step": 29 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5166521954849914, "calib/avg_num_step_conf": 4.421875, "calib/ece": 0.295529411764706, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.07450980392156863, "calib/gap": 0.007873976680724315, "calib/mean_conf": 0.8367058823529411, "calib/mu_c": 0.8402877697841725, "calib/mu_w": 0.8324137931034482, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2935686274509805, "calib/std_conf": 0.07647716412148525, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6702232854864434, "calib/step_q_c_n": 627.0, "calib/step_q_gap": 0.02962922608050278, "calib/step_q_w": 0.6405940594059406, "calib/step_q_w_n": 505.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2170.0, "completions/max_terminated_length": 2170.0, "completions/mean_length": 512.81640625, "completions/mean_terminated_length": 514.8274536132812, "completions/min_length": 0.0, "completions/min_terminated_length": 149.0, "epoch": 0.032, "grad_norm": 0.0070888507179915905, "kl": 0.014923095703125, "learning_rate": 4.722222222222222e-06, "loss": 0.0171, "num_tokens": 6960183.0, "reward": 0.5999123454093933, "reward_std": 0.3764262795448303, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6610507965087891, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.23174266517162323, "step": 30 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5482947394915682, "calib/avg_num_step_conf": 5.03125, "calib/ece": 0.3725296442687748, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.05533596837944664, "calib/gap": 0.014529952177195882, "calib/mean_conf": 0.8254940711462451, "calib/mu_c": 0.8333620689655171, "calib/mu_w": 0.8188321167883212, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3697628458498024, "calib/std_conf": 0.09492379388062085, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.679337899543379, "calib/step_q_c_n": 438.0, "calib/step_q_gap": 0.07640848777867304, "calib/step_q_w": 0.6029294117647059, "calib/step_q_w_n": 850.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2686.0, "completions/max_terminated_length": 2686.0, "completions/mean_length": 518.25, "completions/mean_terminated_length": 520.2824096679688, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.03306666666666667, "grad_norm": 0.0068392446264624596, "kl": 0.017084121704101562, "learning_rate": 4.694444444444445e-06, "loss": 0.0333, "num_tokens": 7198767.0, "reward": 0.5243625640869141, "reward_std": 0.30371779203414917, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.6080308556556702, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.15241290628910065, "step": 31 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.48380566801619435, "calib/avg_num_step_conf": 3.69140625, "calib/ece": 0.29552, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.044, "calib/gap": 0.007709016130068913, "calib/mean_conf": 0.82752, "calib/mu_c": 0.8311278195488723, "calib/mu_w": 0.8234188034188034, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.29552, "calib/std_conf": 0.07800672791496897, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.684712389380531, "calib/step_q_c_n": 452.0, "calib/step_q_gap": 0.03771441777809692, "calib/step_q_w": 0.6469979716024341, "calib/step_q_w_n": 493.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2279.0, "completions/max_terminated_length": 2279.0, "completions/mean_length": 433.984375, "completions/mean_terminated_length": 440.873046875, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.034133333333333335, "grad_norm": 0.009558629244565964, "kl": 0.040973663330078125, "learning_rate": 4.666666666666667e-06, "loss": 0.0403, "num_tokens": 7416571.0, "reward": 0.5686406493186951, "reward_std": 0.33148401975631714, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6421367526054382, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.19670704007148743, "step": 32 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.48044094488188976, "calib/avg_num_step_conf": 3.984375, "calib/ece": 0.35210317460317464, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.05952380952380952, "calib/gap": -0.005273070866141594, "calib/mean_conf": 0.8428174603174603, "calib/mu_c": 0.84016, "calib/mu_w": 0.8454330708661416, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34944444444444445, "calib/std_conf": 0.05107182940358978, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7085355648535564, "calib/step_q_c_n": 478.0, "calib/step_q_gap": 0.009974679244700302, "calib/step_q_w": 0.6985608856088561, "calib/step_q_w_n": 542.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2483.0, "completions/max_terminated_length": 2483.0, "completions/mean_length": 466.28125, "completions/mean_terminated_length": 468.1098327636719, "completions/min_length": 0.0, "completions/min_terminated_length": 120.0, "epoch": 0.0352, "grad_norm": 0.007306600920855999, "kl": 0.019073486328125, "learning_rate": 4.638888888888889e-06, "loss": 0.0535, "num_tokens": 7642811.0, "reward": 0.5501465797424316, "reward_std": 0.3264395594596863, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.6147527694702148, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.19100919365882874, "step": 33 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.429690027826312, "calib/avg_num_step_conf": 4.3203125, "calib/ece": 0.2474803149606299, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.07086614173228346, "calib/gap": -0.01012618908949714, "calib/mean_conf": 0.8367716535433071, "calib/mu_c": 0.8327450980392157, "calib/mu_w": 0.8428712871287128, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.24094488188976376, "calib/std_conf": 0.08078241300924664, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6851067073170731, "calib/step_q_c_n": 656.0, "calib/step_q_gap": -0.019315514905149156, "calib/step_q_w": 0.7044222222222223, "calib/step_q_w_n": 450.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1928.0, "completions/max_terminated_length": 1928.0, "completions/mean_length": 422.76953125, "completions/mean_terminated_length": 424.427490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.03626666666666667, "grad_norm": 0.09493603557348251, "kl": 0.18951797485351562, "learning_rate": 4.611111111111112e-06, "loss": 0.0335, "num_tokens": 7856152.0, "reward": 0.672420859336853, "reward_std": 0.3296073079109192, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6849804520606995, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.34267374873161316, "step": 34 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4924242424242424, "calib/avg_num_step_conf": 3.703125, "calib/ece": 0.32976190476190465, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.05952380952380952, "calib/gap": 0.000606060606060721, "calib/mean_conf": 0.8453174603174602, "calib/mu_c": 0.8456060606060607, "calib/mu_w": 0.845, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3256349206349205, "calib/std_conf": 0.05436044471033665, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7239529914529914, "calib/step_q_c_n": 468.0, "calib/step_q_gap": 0.02534882478632483, "calib/step_q_w": 0.6986041666666666, "calib/step_q_w_n": 480.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2316.0, "completions/max_terminated_length": 2316.0, "completions/mean_length": 506.8046875, "completions/mean_terminated_length": 508.79217529296875, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.037333333333333336, "grad_norm": 0.0072205690667033195, "kl": 0.01589202880859375, "learning_rate": 4.583333333333333e-06, "loss": 0.0687, "num_tokens": 8095150.0, "reward": 0.5845861434936523, "reward_std": 0.3286598026752472, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.634475827217102, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.23469656705856323, "step": 35 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5039518900343642, "calib/avg_num_step_conf": 3.546875, "calib/ece": 0.07807086614173235, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.06299212598425197, "calib/gap": 0.0015498281786939794, "calib/mean_conf": 0.8418503937007873, "calib/mu_c": 0.8422164948453608, "calib/mu_w": 0.8406666666666668, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.07807086614173235, "calib/std_conf": 0.049780238108552796, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7303614457831324, "calib/step_q_c_n": 664.0, "calib/step_q_gap": 0.056386035947066815, "calib/step_q_w": 0.6739754098360656, "calib/step_q_w_n": 244.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2283.0, "completions/max_terminated_length": 2283.0, "completions/mean_length": 439.39453125, "completions/mean_terminated_length": 439.39453125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.0384, "grad_norm": 0.007099926937371492, "kl": 0.022426605224609375, "learning_rate": 4.555555555555556e-06, "loss": -0.0302, "num_tokens": 8310347.0, "reward": 0.7966442108154297, "reward_std": 0.2431754469871521, "rewards/accuracy_reward_step": 0.7578125, "rewards/final_brier_reward_step": 0.8040753602981567, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.439994215965271, "step": 36 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.497625643055006, "calib/avg_num_step_conf": 3.6875, "calib/ece": 0.3941295546558704, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.0931174089068826, "calib/gap": -0.007568922305764447, "calib/mean_conf": 0.844251012145749, "calib/mu_c": 0.8401754385964912, "calib/mu_w": 0.8477443609022557, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3884210526315789, "calib/std_conf": 0.06419972748460089, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7431695331695333, "calib/step_q_c_n": 407.0, "calib/step_q_gap": 0.05086040840230799, "calib/step_q_w": 0.6923091247672253, "calib/step_q_w_n": 537.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2723.0, "completions/max_terminated_length": 2723.0, "completions/mean_length": 489.390625, "completions/mean_terminated_length": 493.24407958984375, "completions/min_length": 0.0, "completions/min_terminated_length": 131.0, "epoch": 0.039466666666666664, "grad_norm": 0.009426533244550228, "kl": 0.030477523803710938, "learning_rate": 4.527777777777778e-06, "loss": 0.0313, "num_tokens": 8542727.0, "reward": 0.4761776626110077, "reward_std": 0.3279336094856262, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.5761339664459229, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.0941900834441185, "step": 37 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.45890324214792294, "calib/avg_num_step_conf": 3.52734375, "calib/ece": 0.2967588932806325, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.10276679841897234, "calib/gap": -0.00219478216818636, "calib/mean_conf": 0.8432411067193675, "calib/mu_c": 0.8422695035460993, "calib/mu_w": 0.8444642857142857, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2913438735177866, "calib/std_conf": 0.07007455917688288, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7394324853228962, "calib/step_q_c_n": 511.0, "calib/step_q_gap": 1.9220016773791038e-05, "calib/step_q_w": 0.7394132653061224, "calib/step_q_w_n": 392.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2769.0, "completions/max_terminated_length": 2769.0, "completions/mean_length": 449.6328125, "completions/mean_terminated_length": 451.3961181640625, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.04053333333333333, "grad_norm": 0.007991362363100052, "kl": 0.023359298706054688, "learning_rate": 4.5e-06, "loss": 0.0814, "num_tokens": 8764721.0, "reward": 0.5837390422821045, "reward_std": 0.35104483366012573, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6577367186546326, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.20192888379096985, "step": 38 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.3888250319284803, "calib/avg_num_step_conf": 3.73828125, "calib/ece": 0.28853754940711457, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.1383399209486166, "calib/gap": -0.023364623243933624, "calib/mean_conf": 0.8511462450592886, "calib/mu_c": 0.8411724137931035, "calib/mu_w": 0.8645370370370371, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28328063241106716, "calib/std_conf": 0.06406418437084797, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7339884393063584, "calib/step_q_c_n": 519.0, "calib/step_q_gap": -0.00384261092195215, "calib/step_q_w": 0.7378310502283105, "calib/step_q_w_n": 438.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2717.0, "completions/max_terminated_length": 2717.0, "completions/mean_length": 462.96484375, "completions/mean_terminated_length": 462.96484375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.0416, "grad_norm": 0.006961394567042589, "kl": 0.02230072021484375, "learning_rate": 4.472222222222223e-06, "loss": -0.0099, "num_tokens": 8989328.0, "reward": 0.5831207036972046, "reward_std": 0.32755035161972046, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6547492742538452, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.1997734010219574, "step": 39 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5232810465620931, "calib/avg_num_step_conf": 3.46875, "calib/ece": 0.35177165354330703, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.1062992125984252, "calib/gap": 0.01598425196850395, "calib/mean_conf": 0.8517716535433071, "calib/mu_c": 0.8597637795275591, "calib/mu_w": 0.8437795275590552, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.35177165354330703, "calib/std_conf": 0.0795979749847648, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7475586854460095, "calib/step_q_c_n": 426.0, "calib/step_q_gap": 0.0009136638009876741, "calib/step_q_w": 0.7466450216450218, "calib/step_q_w_n": 462.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1453.0, "completions/max_terminated_length": 1453.0, "completions/mean_length": 469.97265625, "completions/mean_terminated_length": 469.97265625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.042666666666666665, "grad_norm": 0.008221627213060856, "kl": 0.03347587585449219, "learning_rate": 4.444444444444444e-06, "loss": 0.0195, "num_tokens": 9216401.0, "reward": 0.5157410502433777, "reward_std": 0.3499770760536194, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6230074167251587, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.11081837117671967, "step": 40 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5269940314704287, "calib/avg_num_step_conf": 3.5625, "calib/ece": 0.08657370517928295, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.0796812749003984, "calib/gap": 0.003040332790739786, "calib/mean_conf": 0.8432270916334661, "calib/mu_c": 0.8439175257731959, "calib/mu_w": 0.8408771929824561, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.07844621513944232, "calib/std_conf": 0.0825462819782565, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7594736842105264, "calib/step_q_c_n": 722.0, "calib/step_q_gap": 0.006631578947368433, "calib/step_q_w": 0.752842105263158, "calib/step_q_w_n": 190.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2519.0, "completions/max_terminated_length": 2519.0, "completions/mean_length": 450.29296875, "completions/mean_terminated_length": 450.29296875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.04373333333333333, "grad_norm": 0.0071234856732189655, "kl": 0.02417755126953125, "learning_rate": 4.416666666666667e-06, "loss": 0.0529, "num_tokens": 9438924.0, "reward": 0.7837193012237549, "reward_std": 0.34599435329437256, "rewards/accuracy_reward_step": 0.7578125, "rewards/final_brier_reward_step": 0.7970812320709229, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.4234822392463684, "step": 41 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5055284857571214, "calib/avg_num_step_conf": 3.77734375, "calib/ece": 0.31543307086614175, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.1220472440944882, "calib/gap": -0.001366816591704123, "calib/mean_conf": 0.858740157480315, "calib/mu_c": 0.8581159420289856, "calib/mu_w": 0.8594827586206897, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31543307086614175, "calib/std_conf": 0.04721357281802466, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7388235294117648, "calib/step_q_c_n": 493.0, "calib/step_q_gap": -0.02081782079920569, "calib/step_q_w": 0.7596413502109705, "calib/step_q_w_n": 474.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2729.0, "completions/max_terminated_length": 2729.0, "completions/mean_length": 416.92578125, "completions/mean_terminated_length": 416.92578125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.0448, "grad_norm": 0.007271526847034693, "kl": 0.020999908447265625, "learning_rate": 4.388888888888889e-06, "loss": 0.0215, "num_tokens": 9650025.0, "reward": 0.5809261798858643, "reward_std": 0.33942875266075134, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6443961262702942, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.21120622754096985, "step": 42 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5182995495495495, "calib/avg_num_step_conf": 3.4375, "calib/ece": 0.29317647058823526, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.19607843137254902, "calib/gap": 0.00631193693693699, "calib/mean_conf": 0.8499607843137256, "calib/mu_c": 0.8527083333333333, "calib/mu_w": 0.8463963963963963, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2892156862745098, "calib/std_conf": 0.08366364984428545, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7397265625, "calib/step_q_c_n": 512.0, "calib/step_q_gap": -0.005028872282608621, "calib/step_q_w": 0.7447554347826086, "calib/step_q_w_n": 368.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1084.0, "completions/max_terminated_length": 1084.0, "completions/mean_length": 472.7890625, "completions/mean_terminated_length": 474.6431579589844, "completions/min_length": 0.0, "completions/min_terminated_length": 108.0, "epoch": 0.04586666666666667, "grad_norm": 0.006691481452435255, "kl": 0.022130966186523438, "learning_rate": 4.361111111111112e-06, "loss": -0.0062, "num_tokens": 9876283.0, "reward": 0.5755380988121033, "reward_std": 0.38558706641197205, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6614230871200562, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.17949683964252472, "step": 43 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5408879969898408, "calib/avg_num_step_conf": 4.0234375, "calib/ece": 0.3985770750988141, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.22529644268774704, "calib/gap": 0.012836448012040536, "calib/mean_conf": 0.8638735177865613, "calib/mu_c": 0.8706722689075629, "calib/mu_w": 0.8578358208955223, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39604743083003946, "calib/std_conf": 0.07086155160819461, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7696687370600414, "calib/step_q_c_n": 483.0, "calib/step_q_gap": 0.012155025908304529, "calib/step_q_w": 0.7575137111517368, "calib/step_q_w_n": 547.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1478.0, "completions/max_terminated_length": 1478.0, "completions/mean_length": 499.76953125, "completions/mean_terminated_length": 505.6956787109375, "completions/min_length": 0.0, "completions/min_terminated_length": 181.0, "epoch": 0.046933333333333334, "grad_norm": 0.00664907181635499, "kl": 0.019491195678710938, "learning_rate": 4.333333333333334e-06, "loss": -0.0114, "num_tokens": 10110544.0, "reward": 0.4891369342803955, "reward_std": 0.327079176902771, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.5903961062431335, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.09725279361009598, "step": 44 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5010208113804004, "calib/avg_num_step_conf": 3.9453125, "calib/ece": 0.27428, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.196, "calib/gap": 0.007297154899894465, "calib/mean_conf": 0.8558000000000001, "calib/mu_c": 0.858835616438356, "calib/mu_w": 0.8515384615384616, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.27304, "calib/std_conf": 0.07442822045434111, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7594680851063831, "calib/step_q_c_n": 564.0, "calib/step_q_gap": -0.0038951435931684486, "calib/step_q_w": 0.7633632286995515, "calib/step_q_w_n": 446.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2729.0, "completions/max_terminated_length": 2729.0, "completions/mean_length": 502.17578125, "completions/mean_terminated_length": 506.1299133300781, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.048, "grad_norm": 0.0068217390216887, "kl": 0.022533416748046875, "learning_rate": 4.305555555555556e-06, "loss": 0.0349, "num_tokens": 10344149.0, "reward": 0.6221271753311157, "reward_std": 0.4009205996990204, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6636918187141418, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.271968811750412, "step": 45 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.48097999235376576, "calib/avg_num_step_conf": 4.1953125, "calib/ece": 0.3437051792828685, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.1952191235059761, "calib/gap": -0.01948515356187075, "calib/mean_conf": 0.845776892430279, "calib/mu_c": 0.8366165413533835, "calib/mu_w": 0.8561016949152542, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.329800796812749, "calib/std_conf": 0.08657547443019373, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7438319327731092, "calib/step_q_c_n": 595.0, "calib/step_q_gap": -0.014038630901212268, "calib/step_q_w": 0.7578705636743215, "calib/step_q_w_n": 479.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2426.0, "completions/max_terminated_length": 2426.0, "completions/mean_length": 548.8671875, "completions/mean_terminated_length": 548.8671875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.04906666666666667, "grad_norm": 0.006014056038111448, "kl": 0.023838043212890625, "learning_rate": 4.277777777777778e-06, "loss": 0.0181, "num_tokens": 10589427.0, "reward": 0.529619038105011, "reward_std": 0.39823025465011597, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6215183734893799, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.13771970570087433, "step": 46 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4879260429003816, "calib/avg_num_step_conf": 3.88671875, "calib/ece": 0.2801195219123506, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.27091633466135456, "calib/gap": -0.003914330833004409, "calib/mean_conf": 0.865617529880478, "calib/mu_c": 0.8640268456375838, "calib/mu_w": 0.8679411764705882, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.27605577689243027, "calib/std_conf": 0.0657406252801508, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.75665568369028, "calib/step_q_c_n": 607.0, "calib/step_q_gap": -0.009272151361266356, "calib/step_q_w": 0.7659278350515464, "calib/step_q_w_n": 388.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2340.0, "completions/max_terminated_length": 2340.0, "completions/mean_length": 553.3203125, "completions/mean_terminated_length": 555.490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 179.0, "epoch": 0.050133333333333335, "grad_norm": 0.006215677130967379, "kl": 0.0251312255859375, "learning_rate": 4.25e-06, "loss": -0.0096, "num_tokens": 10837053.0, "reward": 0.6104649305343628, "reward_std": 0.33931007981300354, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6642383337020874, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.24419158697128296, "step": 47 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.460375, "calib/avg_num_step_conf": 3.89453125, "calib/ece": 0.36201581027667984, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.23715415019762845, "calib/gap": -0.012888124999999917, "calib/mean_conf": 0.8600395256916995, "calib/mu_c": 0.8536718750000001, "calib/mu_w": 0.86656, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3580632411067194, "calib/std_conf": 0.08404072993211129, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.757972166998012, "calib/step_q_c_n": 503.0, "calib/step_q_gap": -0.0043962540546196305, "calib/step_q_w": 0.7623684210526316, "calib/step_q_w_n": 494.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2845.0, "completions/max_terminated_length": 2845.0, "completions/mean_length": 506.6171875, "completions/mean_terminated_length": 506.6171875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.0512, "grad_norm": 0.006817369256168604, "kl": 0.028110504150390625, "learning_rate": 4.222222222222223e-06, "loss": 0.0049, "num_tokens": 11070435.0, "reward": 0.5294307470321655, "reward_std": 0.3830631971359253, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.6033015847206116, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.1586848795413971, "step": 48 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4975291695264242, "calib/avg_num_step_conf": 3.94921875, "calib/ece": 0.2572289156626505, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.2931726907630522, "calib/gap": 0.0009238160603980416, "calib/mean_conf": 0.8724899598393574, "calib/mu_c": 0.8728387096774193, "calib/mu_w": 0.8719148936170212, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.2536144578313252, "calib/std_conf": 0.06470371847379203, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.7685365853658537, "calib/step_q_c_n": 656.0, "calib/step_q_gap": -0.0008436963242871576, "calib/step_q_w": 0.7693802816901408, "calib/step_q_w_n": 355.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2730.0, "completions/max_terminated_length": 2730.0, "completions/mean_length": 562.52734375, "completions/mean_terminated_length": 564.7333374023438, "completions/min_length": 0.0, "completions/min_terminated_length": 185.0, "epoch": 0.05226666666666667, "grad_norm": 0.006192709319293499, "kl": 0.030986785888671875, "learning_rate": 4.194444444444445e-06, "loss": 0.0284, "num_tokens": 11318978.0, "reward": 0.6265689134597778, "reward_std": 0.34821343421936035, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6752644777297974, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.2653733491897583, "step": 49 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5368070175438596, "calib/avg_num_step_conf": 4.0234375, "calib/ece": 0.2666122448979591, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.35918367346938773, "calib/gap": 0.004743859649122828, "calib/mean_conf": 0.8746938775510205, "calib/mu_c": 0.8765333333333333, "calib/mu_w": 0.8717894736842104, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2645306122448978, "calib/std_conf": 0.0647791236470115, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7780928689883914, "calib/step_q_c_n": 603.0, "calib/step_q_gap": 0.012519098496588166, "calib/step_q_w": 0.7655737704918032, "calib/step_q_w_n": 427.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2963.0, "completions/max_terminated_length": 2963.0, "completions/mean_length": 601.80078125, "completions/mean_terminated_length": 604.1608276367188, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.05333333333333334, "grad_norm": 0.22164776921272278, "kl": 1.2020721435546875, "learning_rate": 4.166666666666667e-06, "loss": 0.0174, "num_tokens": 11578399.0, "reward": 0.5997195243835449, "reward_std": 0.3739992380142212, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6526364088058472, "rewards/format_reward_step": 0.9453125, "rewards/step_correlation_reward": 0.24055272340774536, "step": 50 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.533546752034147, "calib/avg_num_step_conf": 4.1015625, "calib/ece": 0.27466135458167307, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.398406374501992, "calib/gap": 0.01503067893824206, "calib/mean_conf": 0.8717131474103587, "calib/mu_c": 0.8775816993464051, "calib/mu_w": 0.862551020408163, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.26840637450199184, "calib/std_conf": 0.09838057955065771, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.7840649149922719, "calib/step_q_c_n": 647.0, "calib/step_q_gap": 0.02436268174165157, "calib/step_q_w": 0.7597022332506204, "calib/step_q_w_n": 403.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2555.0, "completions/max_terminated_length": 2555.0, "completions/mean_length": 565.29296875, "completions/mean_terminated_length": 567.5098266601562, "completions/min_length": 0.0, "completions/min_terminated_length": 198.0, "epoch": 0.0544, "grad_norm": 0.00638164347037673, "kl": 0.03388214111328125, "learning_rate": 4.138888888888889e-06, "loss": 0.0221, "num_tokens": 11832410.0, "reward": 0.6391332149505615, "reward_std": 0.3710130453109741, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6637433767318726, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.3035855293273926, "step": 51 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4566133720930233, "calib/avg_num_step_conf": 4.16796875, "calib/ece": 0.20392857142857143, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.4087301587301587, "calib/gap": 0.008645348837209244, "calib/mean_conf": 0.8661507936507936, "calib/mu_c": 0.8688953488372092, "calib/mu_w": 0.86025, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.19376984126984126, "calib/std_conf": 0.12019769214832211, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7663157894736842, "calib/step_q_c_n": 722.0, "calib/step_q_gap": -0.016843630816170907, "calib/step_q_w": 0.7831594202898551, "calib/step_q_w_n": 345.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2614.0, "completions/max_terminated_length": 2614.0, "completions/mean_length": 549.65625, "completions/mean_terminated_length": 549.65625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.055466666666666664, "grad_norm": 0.034107550978660583, "kl": 0.16524124145507812, "learning_rate": 4.111111111111111e-06, "loss": 0.0429, "num_tokens": 12081074.0, "reward": 0.7069775462150574, "reward_std": 0.37056323885917664, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7231961488723755, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.3602902293205261, "step": 52 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4809523809523809, "calib/avg_num_step_conf": 4.29296875, "calib/ece": 0.2893650793650794, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.38095238095238093, "calib/gap": -0.0012517006802721387, "calib/mean_conf": 0.8726984126984126, "calib/mu_c": 0.8721768707482992, "calib/mu_w": 0.8734285714285713, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2893650793650794, "calib/std_conf": 0.0869812304001554, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7642366612111293, "calib/step_q_c_n": 611.0, "calib/step_q_gap": -0.0051280928872314835, "calib/step_q_w": 0.7693647540983608, "calib/step_q_w_n": 488.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2203.0, "completions/max_terminated_length": 2203.0, "completions/mean_length": 588.453125, "completions/mean_terminated_length": 588.453125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.05653333333333333, "grad_norm": 0.004999403841793537, "kl": 0.0465087890625, "learning_rate": 4.083333333333334e-06, "loss": 0.0229, "num_tokens": 12337542.0, "reward": 0.612328052520752, "reward_std": 0.3517555594444275, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6539046764373779, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.25981390476226807, "step": 53 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.513514500693988, "calib/avg_num_step_conf": 4.27734375, "calib/ece": 0.20884, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.496, "calib/gap": 0.012274819197896303, "calib/mean_conf": 0.88052, "calib/mu_c": 0.8844970414201184, "calib/mu_w": 0.8722222222222221, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.20668, "calib/std_conf": 0.09331307303909778, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7815868673050614, "calib/step_q_c_n": 731.0, "calib/step_q_gap": 0.015542911261105297, "calib/step_q_w": 0.7660439560439561, "calib/step_q_w_n": 364.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1841.0, "completions/max_terminated_length": 1841.0, "completions/mean_length": 509.8671875, "completions/mean_terminated_length": 513.8818969726562, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.0576, "grad_norm": 0.0054353284649550915, "kl": 0.06253814697265625, "learning_rate": 4.055555555555556e-06, "loss": -0.0021, "num_tokens": 12574300.0, "reward": 0.6638885736465454, "reward_std": 0.40448057651519775, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7185714840888977, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.28108060359954834, "step": 54 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.47657157609397177, "calib/avg_num_step_conf": 4.203125, "calib/ece": 0.3573493975903614, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.42168674698795183, "calib/gap": 0.019470117464825076, "calib/mean_conf": 0.8673895582329318, "calib/mu_c": 0.8769291338582677, "calib/mu_w": 0.8574590163934426, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.3573493975903614, "calib/std_conf": 0.11835219397493374, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7916758747697974, "calib/step_q_c_n": 543.0, "calib/step_q_gap": 0.0026890079780526044, "calib/step_q_w": 0.7889868667917448, "calib/step_q_w_n": 533.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2484.0, "completions/max_terminated_length": 2484.0, "completions/mean_length": 558.59765625, "completions/mean_terminated_length": 558.59765625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.058666666666666666, "grad_norm": 0.005009907763451338, "kl": 0.06207275390625, "learning_rate": 4.027777777777779e-06, "loss": 0.0253, "num_tokens": 12825125.0, "reward": 0.511044979095459, "reward_std": 0.39973604679107666, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.597327709197998, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.13179343938827515, "step": 55 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.4796370967741936, "calib/avg_num_step_conf": 4.37109375, "calib/ece": 0.39790983606557373, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.4918032786885246, "calib/gap": -0.00955376344086023, "calib/mean_conf": 0.8861885245901638, "calib/mu_c": 0.8813333333333333, "calib/mu_w": 0.8908870967741935, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.39614754098360655, "calib/std_conf": 0.07968573848246181, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.7779211469534051, "calib/step_q_c_n": 558.0, "calib/step_q_gap": -0.016517355720391702, "calib/step_q_w": 0.7944385026737968, "calib/step_q_w_n": 561.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2506.0, "completions/max_terminated_length": 2506.0, "completions/mean_length": 581.78125, "completions/mean_terminated_length": 586.3621826171875, "completions/min_length": 0.0, "completions/min_terminated_length": 176.0, "epoch": 0.05973333333333333, "grad_norm": 0.004738733638077974, "kl": 0.06552886962890625, "learning_rate": 4.000000000000001e-06, "loss": -0.0124, "num_tokens": 13080901.0, "reward": 0.49623072147369385, "reward_std": 0.3822718858718872, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.5537382960319519, "rewards/format_reward_step": 0.9375, "rewards/step_correlation_reward": 0.1574731469154358, "step": 56 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5611590972415715, "calib/avg_num_step_conf": 4.515625, "calib/ece": 0.279469387755102, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.49795918367346936, "calib/gap": 0.040534271384786935, "calib/mean_conf": 0.8768571428571428, "calib/mu_c": 0.8929054054054054, "calib/mu_w": 0.8523711340206185, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.27612244897959176, "calib/std_conf": 0.12209531238390132, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.781980056980057, "calib/step_q_c_n": 702.0, "calib/step_q_gap": 0.006032920416180421, "calib/step_q_w": 0.7759471365638766, "calib/step_q_w_n": 454.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2863.0, "completions/max_terminated_length": 2863.0, "completions/mean_length": 599.22265625, "completions/mean_terminated_length": 601.5725708007812, "completions/min_length": 0.0, "completions/min_terminated_length": 262.0, "epoch": 0.0608, "grad_norm": 0.0041455053724348545, "kl": 0.066436767578125, "learning_rate": 3.972222222222223e-06, "loss": 0.0291, "num_tokens": 13341094.0, "reward": 0.5770140290260315, "reward_std": 0.38654983043670654, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6612206697463989, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.18421359360218048, "step": 57 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4436655783880673, "calib/avg_num_step_conf": 4.8203125, "calib/ece": 0.39148000000000005, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.504, "calib/gap": -0.005337046283848679, "calib/mean_conf": 0.87076, "calib/mu_c": 0.8680487804878048, "calib/mu_w": 0.8733858267716534, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.3851200000000001, "calib/std_conf": 0.12297569841232861, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7651885098743268, "calib/step_q_c_n": 557.0, "calib/step_q_gap": 0.028526766890574984, "calib/step_q_w": 0.7366617429837519, "calib/step_q_w_n": 677.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2777.0, "completions/max_terminated_length": 2777.0, "completions/mean_length": 648.44921875, "completions/mean_terminated_length": 648.44921875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.06186666666666667, "grad_norm": 0.004322744905948639, "kl": 0.07526397705078125, "learning_rate": 3.944444444444445e-06, "loss": 0.001, "num_tokens": 13613417.0, "reward": 0.4562169909477234, "reward_std": 0.46282821893692017, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.5709011554718018, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.05168907344341278, "step": 58 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5150702844066688, "calib/avg_num_step_conf": 4.3046875, "calib/ece": 0.3415725806451612, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.46774193548387094, "calib/gap": 0.01627394573389984, "calib/mean_conf": 0.8668145161290324, "calib/mu_c": 0.874360902255639, "calib/mu_w": 0.8580869565217392, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.33604838709677415, "calib/std_conf": 0.12877185795889265, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7930398671096346, "calib/step_q_c_n": 602.0, "calib/step_q_gap": 0.01533986710963453, "calib/step_q_w": 0.7777000000000001, "calib/step_q_w_n": 500.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2536.0, "completions/max_terminated_length": 2536.0, "completions/mean_length": 580.48828125, "completions/mean_terminated_length": 582.7647705078125, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.06293333333333333, "grad_norm": 0.004097119905054569, "kl": 0.078094482421875, "learning_rate": 3.916666666666667e-06, "loss": 0.0634, "num_tokens": 13868270.0, "reward": 0.5046218633651733, "reward_std": 0.38234949111938477, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.609442949295044, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.10448820888996124, "step": 59 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.43834448379902924, "calib/avg_num_step_conf": 4.22265625, "calib/ece": 0.3957085020242915, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.44534412955465585, "calib/gap": -0.021629935720844706, "calib/mean_conf": 0.8699595141700405, "calib/mu_c": 0.8589256198347107, "calib/mu_w": 0.8805555555555554, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.38789473684210524, "calib/std_conf": 0.11143705391107282, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.7760658914728683, "calib/step_q_c_n": 516.0, "calib/step_q_gap": 0.014685360499416933, "calib/step_q_w": 0.7613805309734514, "calib/step_q_w_n": 565.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2316.0, "completions/max_terminated_length": 2316.0, "completions/mean_length": 561.9765625, "completions/mean_terminated_length": 566.4015502929688, "completions/min_length": 0.0, "completions/min_terminated_length": 235.0, "epoch": 0.064, "grad_norm": 0.004728895146399736, "kl": 0.0794830322265625, "learning_rate": 3.88888888888889e-06, "loss": 0.0085, "num_tokens": 14120992.0, "reward": 0.45893681049346924, "reward_std": 0.4327120780944824, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.5494043231010437, "rewards/format_reward_step": 0.94921875, "rewards/step_correlation_reward": 0.08331304788589478, "step": 60 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.45793163891323396, "calib/avg_num_step_conf": 4.95703125, "calib/ece": 0.2676377952755905, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.4881889763779528, "calib/gap": -0.01663183442324523, "calib/mean_conf": 0.8555905511811024, "calib/mu_c": 0.8496319018404909, "calib/mu_w": 0.8662637362637361, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.24074803149606291, "calib/std_conf": 0.15307428870805093, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7778206896551725, "calib/step_q_c_n": 725.0, "calib/step_q_gap": 0.089548630831643, "calib/step_q_w": 0.6882720588235295, "calib/step_q_w_n": 544.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2716.0, "completions/max_terminated_length": 2716.0, "completions/mean_length": 496.93359375, "completions/mean_terminated_length": 496.93359375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.06506666666666666, "grad_norm": 0.0047287000343203545, "kl": 0.08477783203125, "learning_rate": 3.861111111111112e-06, "loss": 0.0132, "num_tokens": 14352271.0, "reward": 0.6241191625595093, "reward_std": 0.40424540638923645, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6815304756164551, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.24327021837234497, "step": 61 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.49229752066115706, "calib/avg_num_step_conf": 4.12109375, "calib/ece": 0.3784959349593495, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.4349593495934959, "calib/gap": 0.0009712396694213643, "calib/mean_conf": 0.8573577235772356, "calib/mu_c": 0.8578512396694215, "calib/mu_w": 0.8568800000000001, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.37199186991869915, "calib/std_conf": 0.11819411002075521, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.7533976833976834, "calib/step_q_c_n": 518.0, "calib/step_q_gap": 0.005818540008484141, "calib/step_q_w": 0.7475791433891993, "calib/step_q_w_n": 537.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2563.0, "completions/max_terminated_length": 2563.0, "completions/mean_length": 572.79296875, "completions/mean_terminated_length": 579.5850219726562, "completions/min_length": 0.0, "completions/min_terminated_length": 196.0, "epoch": 0.06613333333333334, "grad_norm": 0.004045703914016485, "kl": 0.08335113525390625, "learning_rate": 3.833333333333334e-06, "loss": 0.0174, "num_tokens": 14605986.0, "reward": 0.49485641717910767, "reward_std": 0.44042736291885376, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.571002721786499, "rewards/format_reward_step": 0.9453125, "rewards/step_correlation_reward": 0.1351163387298584, "step": 62 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5197957839262187, "calib/avg_num_step_conf": 4.35546875, "calib/ece": 0.3272874493927127, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.4534412955465587, "calib/gap": 0.016361660079051377, "calib/mean_conf": 0.8617004048582996, "calib/mu_c": 0.8693181818181818, "calib/mu_w": 0.8529565217391304, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.3272874493927127, "calib/std_conf": 0.11042585637848189, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.7688871054039593, "calib/step_q_c_n": 623.0, "calib/step_q_gap": -0.003064114108235816, "calib/step_q_w": 0.7719512195121951, "calib/step_q_w_n": 492.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2289.0, "completions/max_terminated_length": 2289.0, "completions/mean_length": 584.5234375, "completions/mean_terminated_length": 586.8157348632812, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.0672, "grad_norm": 0.004193742293864489, "kl": 0.0828399658203125, "learning_rate": 3.8055555555555556e-06, "loss": 0.025, "num_tokens": 14864264.0, "reward": 0.5371720790863037, "reward_std": 0.41586440801620483, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.6058300733566284, "rewards/format_reward_step": 0.9453125, "rewards/step_correlation_reward": 0.17632651329040527, "step": 63 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6001828277356446, "calib/avg_num_step_conf": 4.4453125, "calib/ece": 0.2853658536585365, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.4105691056910569, "calib/gap": 0.04562296858071502, "calib/mean_conf": 0.8447967479674796, "calib/mu_c": 0.8640845070422535, "calib/mu_w": 0.8184615384615385, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.27646341463414625, "calib/std_conf": 0.1395629041581844, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.770624, "calib/step_q_c_n": 625.0, "calib/step_q_gap": 0.027953434697855717, "calib/step_q_w": 0.7426705653021443, "calib/step_q_w_n": 513.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2935.0, "completions/max_terminated_length": 2935.0, "completions/mean_length": 550.22265625, "completions/mean_terminated_length": 558.9563598632812, "completions/min_length": 0.0, "completions/min_terminated_length": 176.0, "epoch": 0.06826666666666667, "grad_norm": 0.00438300147652626, "kl": 0.090789794921875, "learning_rate": 3.777777777777778e-06, "loss": 0.0226, "num_tokens": 15108897.0, "reward": 0.587658703327179, "reward_std": 0.45948439836502075, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6535687446594238, "rewards/format_reward_step": 0.953125, "rewards/step_correlation_reward": 0.2201860547065735, "step": 64 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5798196596393193, "calib/avg_num_step_conf": 3.9296875, "calib/ece": 0.3533864541832669, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.4063745019920319, "calib/gap": 0.05891605283210555, "calib/mean_conf": 0.8443027888446216, "calib/mu_c": 0.8741129032258064, "calib/mu_w": 0.8151968503937008, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.3518326693227091, "calib/std_conf": 0.1543622400316892, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7706299212598425, "calib/step_q_c_n": 508.0, "calib/step_q_gap": 0.032296587926509246, "calib/step_q_w": 0.7383333333333333, "calib/step_q_w_n": 498.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1564.0, "completions/max_terminated_length": 1564.0, "completions/mean_length": 457.5234375, "completions/mean_terminated_length": 459.31768798828125, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.06933333333333333, "grad_norm": 0.004640705417841673, "kl": 0.093994140625, "learning_rate": 3.7500000000000005e-06, "loss": 0.0027, "num_tokens": 15331047.0, "reward": 0.5712149143218994, "reward_std": 0.37106049060821533, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.6176152229309082, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.23418954014778137, "step": 65 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5258597883597884, "calib/avg_num_step_conf": 4.37890625, "calib/ece": 0.39056680161943313, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.3603238866396761, "calib/gap": 0.021560846560846558, "calib/mean_conf": 0.8360728744939272, "calib/mu_c": 0.847857142857143, "calib/mu_w": 0.8262962962962964, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.38659919028340073, "calib/std_conf": 0.15139955300073482, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.7475578947368421, "calib/step_q_c_n": 475.0, "calib/step_q_gap": 0.006187925696594432, "calib/step_q_w": 0.7413699690402477, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2799.0, "completions/max_terminated_length": 2799.0, "completions/mean_length": 595.5625, "completions/mean_terminated_length": 597.8980712890625, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.0704, "grad_norm": 0.004484922159463167, "kl": 0.0873870849609375, "learning_rate": 3.7222222222222225e-06, "loss": -0.0164, "num_tokens": 15589863.0, "reward": 0.45835405588150024, "reward_std": 0.4077296555042267, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.568659782409668, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.06914205849170685, "step": 66 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5516579183203175, "calib/avg_num_step_conf": 4.375, "calib/ece": 0.2704347826086956, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.33201581027667987, "calib/gap": 0.040174753552682296, "calib/mean_conf": 0.8425296442687746, "calib/mu_c": 0.8595205479452054, "calib/mu_w": 0.8193457943925231, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.2679446640316205, "calib/std_conf": 0.1380751503527436, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7760596923076923, "calib/step_q_c_n": 650.0, "calib/step_q_gap": 0.04142139443535198, "calib/step_q_w": 0.7346382978723404, "calib/step_q_w_n": 470.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2858.0, "completions/max_terminated_length": 2858.0, "completions/mean_length": 524.83203125, "completions/mean_terminated_length": 524.83203125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.07146666666666666, "grad_norm": 0.004559084307402372, "kl": 0.0973663330078125, "learning_rate": 3.694444444444445e-06, "loss": 0.0053, "num_tokens": 15829228.0, "reward": 0.6144473552703857, "reward_std": 0.36117982864379883, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.669489860534668, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.25002986192703247, "step": 67 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5365545401516624, "calib/avg_num_step_conf": 4.265625, "calib/ece": 0.3240799999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.348, "calib/gap": -0.0013040378508003547, "calib/mean_conf": 0.8343200000000001, "calib/mu_c": 0.8337410071942447, "calib/mu_w": 0.8350450450450451, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.3011999999999999, "calib/std_conf": 0.15953349993026544, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.7423938879456706, "calib/step_q_c_n": 589.0, "calib/step_q_gap": 0.0046801702518335375, "calib/step_q_w": 0.7377137176938371, "calib/step_q_w_n": 503.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2428.0, "completions/max_terminated_length": 2428.0, "completions/mean_length": 521.02734375, "completions/mean_terminated_length": 521.02734375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.07253333333333334, "grad_norm": 0.004508777987211943, "kl": 0.10665130615234375, "learning_rate": 3.6666666666666666e-06, "loss": 0.0179, "num_tokens": 16066699.0, "reward": 0.5404424071311951, "reward_std": 0.3533206582069397, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.626226544380188, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.15231452882289886, "step": 68 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.4674674674674675, "calib/avg_num_step_conf": 3.95703125, "calib/ece": 0.3839430894308944, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.27235772357723576, "calib/gap": -0.005239239239239191, "calib/mean_conf": 0.8054878048780488, "calib/mu_c": 0.8026126126126126, "calib/mu_w": 0.8078518518518518, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.3691056910569107, "calib/std_conf": 0.18090885572276796, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.7343735224586289, "calib/step_q_c_n": 423.0, "calib/step_q_gap": -0.011355291100693221, "calib/step_q_w": 0.7457288135593221, "calib/step_q_w_n": 590.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2557.0, "completions/max_terminated_length": 2557.0, "completions/mean_length": 588.40234375, "completions/mean_terminated_length": 590.7098388671875, "completions/min_length": 0.0, "completions/min_terminated_length": 181.0, "epoch": 0.0736, "grad_norm": 0.004546022973954678, "kl": 0.09192657470703125, "learning_rate": 3.638888888888889e-06, "loss": 0.0411, "num_tokens": 16321826.0, "reward": 0.4601125717163086, "reward_std": 0.37661075592041016, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.5603660345077515, "rewards/format_reward_step": 0.94921875, "rewards/step_correlation_reward": 0.08329666405916214, "step": 69 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5459973753280839, "calib/avg_num_step_conf": 4.26953125, "calib/ece": 0.34773279352226727, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.32793522267206476, "calib/gap": 0.046930446194225595, "calib/mean_conf": 0.8230364372469636, "calib/mu_c": 0.8471666666666666, "calib/mu_w": 0.800236220472441, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3424696356275304, "calib/std_conf": 0.16960089698869982, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7481395348837209, "calib/step_q_c_n": 516.0, "calib/step_q_gap": 0.06268026278666716, "calib/step_q_w": 0.6854592720970537, "calib/step_q_w_n": 577.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2470.0, "completions/max_terminated_length": 2470.0, "completions/mean_length": 540.5078125, "completions/mean_terminated_length": 544.7637939453125, "completions/min_length": 0.0, "completions/min_terminated_length": 146.0, "epoch": 0.07466666666666667, "grad_norm": 0.00454904418438673, "kl": 0.085479736328125, "learning_rate": 3.6111111111111115e-06, "loss": 0.0238, "num_tokens": 16567188.0, "reward": 0.5172110795974731, "reward_std": 0.39061713218688965, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.6089847683906555, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.1387186497449875, "step": 70 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.4943941778127458, "calib/avg_num_step_conf": 4.3515625, "calib/ece": 0.3520647773279352, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.27530364372469635, "calib/gap": -0.015126540781536746, "calib/mean_conf": 0.8280971659919029, "calib/mu_c": 0.8205645161290323, "calib/mu_w": 0.835691056910569, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.3390688259109311, "calib/std_conf": 0.13513834069416286, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.7431620553359685, "calib/step_q_c_n": 506.0, "calib/step_q_gap": -0.002956365716663134, "calib/step_q_w": 0.7461184210526316, "calib/step_q_w_n": 608.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2416.0, "completions/max_terminated_length": 2416.0, "completions/mean_length": 535.18359375, "completions/mean_terminated_length": 537.2824096679688, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.07573333333333333, "grad_norm": 0.0041601029224693775, "kl": 0.08847808837890625, "learning_rate": 3.5833333333333335e-06, "loss": 0.0424, "num_tokens": 16808603.0, "reward": 0.509465217590332, "reward_std": 0.4146815538406372, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.5898030996322632, "rewards/format_reward_step": 0.953125, "rewards/step_correlation_reward": 0.1416272521018982, "step": 71 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5870607532647784, "calib/avg_num_step_conf": 4.23828125, "calib/ece": 0.36829365079365084, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.2857142857142857, "calib/gap": 0.032716547851870614, "calib/mean_conf": 0.842579365079365, "calib/mu_c": 0.8595867768595041, "calib/mu_w": 0.8268702290076335, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.3653571428571428, "calib/std_conf": 0.11763920178419383, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.7598109640831759, "calib/step_q_c_n": 529.0, "calib/step_q_gap": 0.016303769838571625, "calib/step_q_w": 0.7435071942446043, "calib/step_q_w_n": 556.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2425.0, "completions/max_terminated_length": 2425.0, "completions/mean_length": 475.34375, "completions/mean_terminated_length": 475.34375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.0768, "grad_norm": 0.004811810329556465, "kl": 0.10382080078125, "learning_rate": 3.555555555555556e-06, "loss": 0.0235, "num_tokens": 17034699.0, "reward": 0.5658941268920898, "reward_std": 0.3956160843372345, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.603821873664856, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.23890388011932373, "step": 72 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5032398793866684, "calib/avg_num_step_conf": 4.1484375, "calib/ece": 0.2762698412698412, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.2619047619047619, "calib/gap": 0.01088342849810775, "calib/mean_conf": 0.8174603174603174, "calib/mu_c": 0.8221678321678323, "calib/mu_w": 0.8112844036697245, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2631349206349206, "calib/std_conf": 0.1501028034378413, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.739360606060606, "calib/step_q_c_n": 594.0, "calib/step_q_gap": 0.003719580419580426, "calib/step_q_w": 0.7356410256410256, "calib/step_q_w_n": 468.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2466.0, "completions/max_terminated_length": 2466.0, "completions/mean_length": 502.109375, "completions/mean_terminated_length": 502.109375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.07786666666666667, "grad_norm": 0.004434308968484402, "kl": 0.08699798583984375, "learning_rate": 3.5277777777777784e-06, "loss": -0.0117, "num_tokens": 17270271.0, "reward": 0.615314245223999, "reward_std": 0.418173611164093, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6604995727539062, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.2623162865638733, "step": 73 }, { "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5044928522804628, "calib/avg_num_step_conf": 4.1640625, "calib/ece": 0.3879012345679014, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.27983539094650206, "calib/gap": 0.00554731109598372, "calib/mean_conf": 0.8300411522633746, "calib/mu_c": 0.8330088495575221, "calib/mu_w": 0.8274615384615384, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.3764609053497944, "calib/std_conf": 0.13532919151475983, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.7249667405764968, "calib/step_q_c_n": 451.0, "calib/step_q_gap": -0.0083503325942349, "calib/step_q_w": 0.7333170731707317, "calib/step_q_w_n": 615.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2985.0, "completions/max_terminated_length": 2985.0, "completions/mean_length": 511.58203125, "completions/mean_terminated_length": 515.6102294921875, "completions/min_length": 0.0, "completions/min_terminated_length": 119.0, "epoch": 0.07893333333333333, "grad_norm": 0.004572301171720028, "kl": 0.1038818359375, "learning_rate": 3.5e-06, "loss": 0.0399, "num_tokens": 17505164.0, "reward": 0.4876174330711365, "reward_std": 0.41045689582824707, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.5689581632614136, "rewards/format_reward_step": 0.94140625, "rewards/step_correlation_reward": 0.12971413135528564, "step": 74 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5626369112814896, "calib/avg_num_step_conf": 4.1875, "calib/ece": 0.1933464566929133, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.25984251968503935, "calib/gap": 0.029557776560788396, "calib/mean_conf": 0.8139763779527559, "calib/mu_c": 0.8242168674698794, "calib/mu_w": 0.794659090909091, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1768897637795275, "calib/std_conf": 0.15970301988412466, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7443496503496504, "calib/step_q_c_n": 715.0, "calib/step_q_gap": 0.0305401265401265, "calib/step_q_w": 0.7138095238095239, "calib/step_q_w_n": 357.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2550.0, "completions/max_terminated_length": 2550.0, "completions/mean_length": 455.59375, "completions/mean_terminated_length": 455.59375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.08, "grad_norm": 0.004388398490846157, "kl": 0.107330322265625, "learning_rate": 3.4722222222222224e-06, "loss": 0.018, "num_tokens": 17726548.0, "reward": 0.7035642266273499, "reward_std": 0.3689558506011963, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7299691438674927, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.34903430938720703, "step": 75 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5625316937119675, "calib/avg_num_step_conf": 4.2421875, "calib/ece": 0.29103174603174603, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.2698412698412698, "calib/gap": 0.034467545638945385, "calib/mean_conf": 0.8072222222222222, "calib/mu_c": 0.8230882352941177, "calib/mu_w": 0.7886206896551723, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27928571428571425, "calib/std_conf": 0.1687965756483741, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6995469798657719, "calib/step_q_c_n": 596.0, "calib/step_q_gap": -0.00010608135871792435, "calib/step_q_w": 0.6996530612244898, "calib/step_q_w_n": 490.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1422.0, "completions/max_terminated_length": 1422.0, "completions/mean_length": 459.72265625, "completions/mean_terminated_length": 463.342529296875, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.08106666666666666, "grad_norm": 0.0049783228896558285, "kl": 0.1104736328125, "learning_rate": 3.444444444444445e-06, "loss": 0.0207, "num_tokens": 17947293.0, "reward": 0.5709596276283264, "reward_std": 0.4218650758266449, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6571656465530396, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.18240979313850403, "step": 76 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.383743842364532, "calib/avg_num_step_conf": 4.734375, "calib/ece": 0.2728800000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.252, "calib/gap": -0.04243021346469622, "calib/mean_conf": 0.8051999999999999, "calib/mu_c": 0.7873793103448276, "calib/mu_w": 0.8298095238095238, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2490400000000001, "calib/std_conf": 0.14694815412246592, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7146539027982327, "calib/step_q_c_n": 679.0, "calib/step_q_gap": -0.026359230410022527, "calib/step_q_w": 0.7410131332082552, "calib/step_q_w_n": 533.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2415.0, "completions/max_terminated_length": 2415.0, "completions/mean_length": 515.83203125, "completions/mean_terminated_length": 515.83203125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.08213333333333334, "grad_norm": 0.004251775331795216, "kl": 0.0996856689453125, "learning_rate": 3.416666666666667e-06, "loss": 0.0346, "num_tokens": 18184010.0, "reward": 0.5908552408218384, "reward_std": 0.39187467098236084, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6375484466552734, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.23869329690933228, "step": 77 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.524085637823372, "calib/avg_num_step_conf": 4.4765625, "calib/ece": 0.30290836653386455, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.21115537848605578, "calib/gap": 0.008916146297948457, "calib/mean_conf": 0.8063346613545816, "calib/mu_c": 0.8105263157894738, "calib/mu_w": 0.8016101694915253, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.2896812749003984, "calib/std_conf": 0.1452683721736049, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.71395, "calib/step_q_c_n": 600.0, "calib/step_q_gap": -0.004474908424908386, "calib/step_q_w": 0.7184249084249084, "calib/step_q_w_n": 546.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1937.0, "completions/max_terminated_length": 1937.0, "completions/mean_length": 515.8828125, "completions/mean_terminated_length": 515.8828125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.0832, "grad_norm": 0.004391307011246681, "kl": 0.1013946533203125, "learning_rate": 3.3888888888888893e-06, "loss": 0.0049, "num_tokens": 18424100.0, "reward": 0.5536520481109619, "reward_std": 0.3932393789291382, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6371738314628601, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.17169275879859924, "step": 78 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5052199159871511, "calib/avg_num_step_conf": 4.6953125, "calib/ece": 0.28796875, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.2421875, "calib/gap": 0.0026809982703236424, "calib/mean_conf": 0.807890625, "calib/mu_c": 0.8090845070422534, "calib/mu_w": 0.8064035087719298, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2705859375, "calib/std_conf": 0.15587280643880566, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7383275862068965, "calib/step_q_c_n": 696.0, "calib/step_q_gap": 0.014039048657489395, "calib/step_q_w": 0.7242885375494071, "calib/step_q_w_n": 506.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1273.0, "completions/max_terminated_length": 1273.0, "completions/mean_length": 503.09375, "completions/mean_terminated_length": 505.06671142578125, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.08426666666666667, "grad_norm": 0.004289821721613407, "kl": 0.0935821533203125, "learning_rate": 3.3611111111111117e-06, "loss": 0.0034, "num_tokens": 18659268.0, "reward": 0.5821490287780762, "reward_std": 0.3844422698020935, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6659070253372192, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.1874534785747528, "step": 79 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5968559837728195, "calib/avg_num_step_conf": 4.86328125, "calib/ece": 0.2800000000000001, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.23809523809523808, "calib/gap": 0.04668356997971601, "calib/mean_conf": 0.8167460317460319, "calib/mu_c": 0.8382352941176471, "calib/mu_w": 0.7915517241379311, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.27853174603174613, "calib/std_conf": 0.1320192949166322, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7245037037037036, "calib/step_q_c_n": 675.0, "calib/step_q_gap": -0.002162962962963033, "calib/step_q_w": 0.7266666666666667, "calib/step_q_w_n": 570.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1733.0, "completions/max_terminated_length": 1733.0, "completions/mean_length": 450.1875, "completions/mean_terminated_length": 453.7322692871094, "completions/min_length": 0.0, "completions/min_terminated_length": 170.0, "epoch": 0.08533333333333333, "grad_norm": 0.004870356526225805, "kl": 0.108734130859375, "learning_rate": 3.3333333333333333e-06, "loss": -0.0003, "num_tokens": 18876676.0, "reward": 0.534776508808136, "reward_std": 0.40727341175079346, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6685359477996826, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.09867335855960846, "step": 80 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5414757412398922, "calib/avg_num_step_conf": 4.6875, "calib/ece": 0.2494715447154471, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.1991869918699187, "calib/gap": 0.02374258760107828, "calib/mean_conf": 0.7789837398373982, "calib/mu_c": 0.7892142857142856, "calib/mu_w": 0.7654716981132074, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.22967479674796745, "calib/std_conf": 0.18046288647293576, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7274414976599064, "calib/step_q_c_n": 641.0, "calib/step_q_gap": 0.012504109466704327, "calib/step_q_w": 0.7149373881932021, "calib/step_q_w_n": 559.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2780.0, "completions/max_terminated_length": 2780.0, "completions/mean_length": 537.35546875, "completions/mean_terminated_length": 539.4627685546875, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.0864, "grad_norm": 0.004383731633424759, "kl": 0.09157562255859375, "learning_rate": 3.3055555555555558e-06, "loss": 0.0371, "num_tokens": 19120487.0, "reward": 0.5714367628097534, "reward_std": 0.4036829471588135, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6617749929428101, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.18031713366508484, "step": 81 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.48127915149191747, "calib/avg_num_step_conf": 4.73046875, "calib/ece": 0.3093253968253968, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.1865079365079365, "calib/gap": -0.030613379336783653, "calib/mean_conf": 0.7924206349206349, "calib/mu_c": 0.7789361702127658, "calib/mu_w": 0.8095495495495495, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2711111111111111, "calib/std_conf": 0.17541360887164556, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7182456140350878, "calib/step_q_c_n": 627.0, "calib/step_q_gap": -0.020042057197788843, "calib/step_q_w": 0.7382876712328766, "calib/step_q_w_n": 584.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2046.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 481.953125, "completions/mean_terminated_length": 481.953125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.08746666666666666, "grad_norm": 0.004382080864161253, "kl": 0.1011962890625, "learning_rate": 3.277777777777778e-06, "loss": 0.0266, "num_tokens": 19349419.0, "reward": 0.5547659993171692, "reward_std": 0.37202906608581543, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6337062120437622, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.1711382269859314, "step": 82 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5297007874015748, "calib/avg_num_step_conf": 4.92578125, "calib/ece": 0.3159722222222222, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.24206349206349206, "calib/gap": 0.032206299212598544, "calib/mean_conf": 0.7813690476190476, "calib/mu_c": 0.7976, "calib/mu_w": 0.7653937007874014, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.30065476190476187, "calib/std_conf": 0.18737167755529816, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.7250498338870432, "calib/step_q_c_n": 602.0, "calib/step_q_gap": 0.04211356681572309, "calib/step_q_w": 0.6829362670713202, "calib/step_q_w_n": 659.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2480.0, "completions/max_terminated_length": 2480.0, "completions/mean_length": 546.04296875, "completions/mean_terminated_length": 546.04296875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.08853333333333334, "grad_norm": 0.0038687747437506914, "kl": 0.0943603515625, "learning_rate": 3.2500000000000002e-06, "loss": 0.0037, "num_tokens": 19596470.0, "reward": 0.5185824036598206, "reward_std": 0.3328552842140198, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.6270061731338501, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.11953361332416534, "step": 83 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.496384, "calib/avg_num_step_conf": 4.4453125, "calib/ece": 0.3124, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.16, "calib/gap": 0.0033600000000001407, "calib/mean_conf": 0.78112, "calib/mu_c": 0.7828, "calib/mu_w": 0.7794399999999999, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.29676, "calib/std_conf": 0.1631451672591009, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7255148342059338, "calib/step_q_c_n": 573.0, "calib/step_q_gap": 0.014465276683809924, "calib/step_q_w": 0.7110495575221238, "calib/step_q_w_n": 565.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2708.0, "completions/max_terminated_length": 2708.0, "completions/mean_length": 495.66015625, "completions/mean_terminated_length": 497.60394287109375, "completions/min_length": 0.0, "completions/min_terminated_length": 125.0, "epoch": 0.0896, "grad_norm": 0.004789901431649923, "kl": 0.10146331787109375, "learning_rate": 3.2222222222222227e-06, "loss": 0.0466, "num_tokens": 19829279.0, "reward": 0.536548912525177, "reward_std": 0.3873344659805298, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.6269879341125488, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.15392239391803741, "step": 84 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5752652519893899, "calib/avg_num_step_conf": 4.625, "calib/ece": 0.3207317073170731, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.15853658536585366, "calib/gap": 0.05572413793103437, "calib/mean_conf": 0.7922764227642275, "calib/mu_c": 0.8217241379310343, "calib/mu_w": 0.7659999999999999, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.3207317073170731, "calib/std_conf": 0.14751223186503895, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7304512635379061, "calib/step_q_c_n": 554.0, "calib/step_q_gap": 0.01986396195060458, "calib/step_q_w": 0.7105873015873015, "calib/step_q_w_n": 630.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2717.0, "completions/max_terminated_length": 2717.0, "completions/mean_length": 522.79296875, "completions/mean_terminated_length": 522.79296875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.09066666666666667, "grad_norm": 0.004474072251468897, "kl": 0.08792877197265625, "learning_rate": 3.1944444444444443e-06, "loss": 0.0157, "num_tokens": 20070938.0, "reward": 0.532092273235321, "reward_std": 0.39479580521583557, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.6251652240753174, "rewards/format_reward_step": 0.94921875, "rewards/step_correlation_reward": 0.15855059027671814, "step": 85 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.556257982120051, "calib/avg_num_step_conf": 4.62890625, "calib/ece": 0.33151394422310754, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.2151394422310757, "calib/gap": 0.04475095785440608, "calib/mean_conf": 0.781792828685259, "calib/mu_c": 0.8058620689655172, "calib/mu_w": 0.7611111111111111, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.3255776892430279, "calib/std_conf": 0.17219341051812373, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.7266970802919709, "calib/step_q_c_n": 548.0, "calib/step_q_gap": 0.030689230998407258, "calib/step_q_w": 0.6960078492935636, "calib/step_q_w_n": 637.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1767.0, "completions/max_terminated_length": 1767.0, "completions/mean_length": 493.74609375, "completions/mean_terminated_length": 495.682373046875, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.09173333333333333, "grad_norm": 0.004494712222367525, "kl": 0.094451904296875, "learning_rate": 3.1666666666666667e-06, "loss": -0.0006, "num_tokens": 20302849.0, "reward": 0.5276005268096924, "reward_std": 0.3682413399219513, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.6204468607902527, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.15116043388843536, "step": 86 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.502650295222759, "calib/avg_num_step_conf": 4.44140625, "calib/ece": 0.22822834645669293, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.18110236220472442, "calib/gap": -0.01292807300053711, "calib/mean_conf": 0.7861023622047245, "calib/mu_c": 0.7814197530864198, "calib/mu_w": 0.7943478260869569, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1882677165354331, "calib/std_conf": 0.1742005512094152, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7099722607489598, "calib/step_q_c_n": 721.0, "calib/step_q_gap": 0.020933799210498316, "calib/step_q_w": 0.6890384615384615, "calib/step_q_w_n": 416.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1076.0, "completions/max_terminated_length": 1076.0, "completions/mean_length": 436.3203125, "completions/mean_terminated_length": 438.0314025878906, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.0928, "grad_norm": 0.005090489983558655, "kl": 0.1055450439453125, "learning_rate": 3.138888888888889e-06, "loss": 0.0088, "num_tokens": 20520043.0, "reward": 0.6783336400985718, "reward_std": 0.3884207308292389, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7014593482017517, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.33098912239074707, "step": 87 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.530716296527995, "calib/avg_num_step_conf": 5.00390625, "calib/ece": 0.26669291338582674, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2440944881889764, "calib/gap": 0.02194932749452594, "calib/mean_conf": 0.7990551181102361, "calib/mu_c": 0.8089928057553956, "calib/mu_w": 0.7870434782608696, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.259251968503937, "calib/std_conf": 0.1491811405976822, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7272056737588654, "calib/step_q_c_n": 705.0, "calib/step_q_gap": 0.01809109042553203, "calib/step_q_w": 0.7091145833333333, "calib/step_q_w_n": 576.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1999.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 511.73828125, "completions/mean_terminated_length": 513.7451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 181.0, "epoch": 0.09386666666666667, "grad_norm": 0.00454970495775342, "kl": 0.0915679931640625, "learning_rate": 3.1111111111111116e-06, "loss": 0.0229, "num_tokens": 20760896.0, "reward": 0.6263402700424194, "reward_std": 0.3962945342063904, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6637749671936035, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.2849992513656616, "step": 88 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5435695538057742, "calib/avg_num_step_conf": 4.421875, "calib/ece": 0.3020242914979757, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.18218623481781376, "calib/gap": 0.027177821522309853, "calib/mean_conf": 0.7791093117408907, "calib/mu_c": 0.7930833333333334, "calib/mu_w": 0.7659055118110235, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.2976518218623482, "calib/std_conf": 0.15574301536944893, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.7219411764705882, "calib/step_q_c_n": 510.0, "calib/step_q_gap": 0.030799697370909818, "calib/step_q_w": 0.6911414790996784, "calib/step_q_w_n": 622.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2394.0, "completions/max_terminated_length": 2394.0, "completions/mean_length": 535.5859375, "completions/mean_terminated_length": 537.686279296875, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.09493333333333333, "grad_norm": 0.004490287974476814, "kl": 0.08609771728515625, "learning_rate": 3.0833333333333336e-06, "loss": -0.0259, "num_tokens": 21006894.0, "reward": 0.5326032638549805, "reward_std": 0.30912187695503235, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.6257109642028809, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.15433922410011292, "step": 89 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6094750320102433, "calib/avg_num_step_conf": 5.51953125, "calib/ece": 0.26063492063492066, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.19047619047619047, "calib/gap": 0.046624839948783636, "calib/mean_conf": 0.7949999999999999, "calib/mu_c": 0.8153521126760563, "calib/mu_w": 0.7687272727272727, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.24607142857142858, "calib/std_conf": 0.16672974996625384, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7206089309878213, "calib/step_q_c_n": 739.0, "calib/step_q_gap": 0.02218163128455719, "calib/step_q_w": 0.6984272997032641, "calib/step_q_w_n": 674.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2317.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 509.3828125, "completions/mean_terminated_length": 511.38043212890625, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.096, "grad_norm": 0.004317185375839472, "kl": 0.0949859619140625, "learning_rate": 3.055555555555556e-06, "loss": 0.0119, "num_tokens": 21240616.0, "reward": 0.6395683884620667, "reward_std": 0.3783494830131531, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6792097687721252, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.2936769425868988, "step": 90 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5705386237301131, "calib/avg_num_step_conf": 5.0546875, "calib/ece": 0.2511111111111112, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.20634920634920634, "calib/gap": 0.03691010159095254, "calib/mean_conf": 0.7865079365079366, "calib/mu_c": 0.8027659574468085, "calib/mu_w": 0.7658558558558559, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.23904761904761912, "calib/std_conf": 0.16857864002749512, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7056178217821782, "calib/step_q_c_n": 707.0, "calib/step_q_gap": 0.004203852446573397, "calib/step_q_w": 0.7014139693356048, "calib/step_q_w_n": 587.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1629.0, "completions/max_terminated_length": 1629.0, "completions/mean_length": 519.99609375, "completions/mean_terminated_length": 519.99609375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.09706666666666666, "grad_norm": 0.004235545173287392, "kl": 0.0934906005859375, "learning_rate": 3.0277777777777776e-06, "loss": -0.0043, "num_tokens": 21481447.0, "reward": 0.5762742757797241, "reward_std": 0.4081330895423889, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6798367500305176, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.16646182537078857, "step": 91 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5041772959183672, "calib/avg_num_step_conf": 4.41015625, "calib/ece": 0.28130952380952373, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.15476190476190477, "calib/gap": 0.0013571428571428346, "calib/mean_conf": 0.8050396825396826, "calib/mu_c": 0.8056428571428571, "calib/mu_w": 0.8042857142857143, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.26539682539682535, "calib/std_conf": 0.1355075644109864, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.7301848739495799, "calib/step_q_c_n": 595.0, "calib/step_q_gap": 0.004106222264186665, "calib/step_q_w": 0.7260786516853932, "calib/step_q_w_n": 534.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2207.0, "completions/max_terminated_length": 2207.0, "completions/mean_length": 473.8984375, "completions/mean_terminated_length": 475.75689697265625, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.09813333333333334, "grad_norm": 0.004517954774200916, "kl": 0.09569549560546875, "learning_rate": 3e-06, "loss": -0.0145, "num_tokens": 21709485.0, "reward": 0.6152100563049316, "reward_std": 0.3936840295791626, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6535598039627075, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.27373531460762024, "step": 92 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5125, "calib/avg_num_step_conf": 5.43359375, "calib/ece": 0.2835458167330677, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.16733067729083664, "calib/gap": 0.026701653944020465, "calib/mean_conf": 0.7968525896414342, "calib/mu_c": 0.8096183206106871, "calib/mu_w": 0.7829166666666666, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.27924302788844624, "calib/std_conf": 0.15364902243388137, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7297272727272728, "calib/step_q_c_n": 660.0, "calib/step_q_gap": 0.016033702275836337, "calib/step_q_w": 0.7136935704514364, "calib/step_q_w_n": 731.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2662.0, "completions/max_terminated_length": 2662.0, "completions/mean_length": 542.171875, "completions/mean_terminated_length": 542.171875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.0992, "grad_norm": 0.00440006610006094, "kl": 0.08103179931640625, "learning_rate": 2.9722222222222225e-06, "loss": 0.0513, "num_tokens": 21954057.0, "reward": 0.5715123414993286, "reward_std": 0.4189292788505554, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.6469859480857849, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.1991637945175171, "step": 93 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5303413400758534, "calib/avg_num_step_conf": 5.0703125, "calib/ece": 0.26458498023715404, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.20553359683794467, "calib/gap": 0.031416561314791314, "calib/mean_conf": 0.8100395256916996, "calib/mu_c": 0.8240714285714285, "calib/mu_w": 0.7926548672566371, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2606324110671936, "calib/std_conf": 0.12895903698589095, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7232162921348315, "calib/step_q_c_n": 712.0, "calib/step_q_gap": 0.029717998619473218, "calib/step_q_w": 0.6934982935153583, "calib/step_q_w_n": 586.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2273.0, "completions/max_terminated_length": 2273.0, "completions/mean_length": 492.11328125, "completions/mean_terminated_length": 495.9881896972656, "completions/min_length": 0.0, "completions/min_terminated_length": 202.0, "epoch": 0.10026666666666667, "grad_norm": 0.004632134456187487, "kl": 0.0995025634765625, "learning_rate": 2.944444444444445e-06, "loss": -0.0086, "num_tokens": 22188718.0, "reward": 0.6204875707626343, "reward_std": 0.3555172085762024, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6778242588043213, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.25611966848373413, "step": 94 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5884313725490197, "calib/avg_num_step_conf": 5.56640625, "calib/ece": 0.19924901185770755, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.15019762845849802, "calib/gap": 0.043509803921568446, "calib/mean_conf": 0.7673122529644268, "calib/mu_c": 0.7845098039215685, "calib/mu_w": 0.7410000000000001, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.18090909090909096, "calib/std_conf": 0.18145407777171457, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7088129899216126, "calib/step_q_c_n": 893.0, "calib/step_q_gap": 0.03947088465845461, "calib/step_q_w": 0.669342105263158, "calib/step_q_w_n": 532.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2614.0, "completions/max_terminated_length": 2614.0, "completions/mean_length": 527.21875, "completions/mean_terminated_length": 529.2863159179688, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.10133333333333333, "grad_norm": 0.004121353849768639, "kl": 0.081787109375, "learning_rate": 2.916666666666667e-06, "loss": 0.0126, "num_tokens": 22429814.0, "reward": 0.6321449279785156, "reward_std": 0.3281705379486084, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7107530832290649, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.23791177570819855, "step": 95 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5467836257309941, "calib/avg_num_step_conf": 5.265625, "calib/ece": 0.17492125984251963, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.17716535433070865, "calib/gap": 0.006558867047135708, "calib/mean_conf": 0.8061023622047244, "calib/mu_c": 0.8082456140350875, "calib/mu_w": 0.8016867469879518, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.15389763779527557, "calib/std_conf": 0.1393212079002976, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7324615384615384, "calib/step_q_c_n": 910.0, "calib/step_q_gap": 0.03467615033368454, "calib/step_q_w": 0.6977853881278538, "calib/step_q_w_n": 438.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2744.0, "completions/max_terminated_length": 2744.0, "completions/mean_length": 490.96484375, "completions/mean_terminated_length": 490.96484375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.1024, "grad_norm": 0.004674531519412994, "kl": 0.10054779052734375, "learning_rate": 2.888888888888889e-06, "loss": -0.0372, "num_tokens": 22661317.0, "reward": 0.6969508528709412, "reward_std": 0.30746543407440186, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7400012016296387, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.32186925411224365, "step": 96 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5414687499999999, "calib/avg_num_step_conf": 5.140625, "calib/ece": 0.30486166007905124, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.16600790513833993, "calib/gap": 0.02685499999999985, "calib/mean_conf": 0.788893280632411, "calib/mu_c": 0.8024799999999999, "calib/mu_w": 0.775625, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2998418972332014, "calib/std_conf": 0.1536292199667022, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7190620031796502, "calib/step_q_c_n": 629.0, "calib/step_q_gap": 0.010648611622153803, "calib/step_q_w": 0.7084133915574964, "calib/step_q_w_n": 687.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2592.0, "completions/max_terminated_length": 2592.0, "completions/mean_length": 503.1796875, "completions/mean_terminated_length": 505.1529846191406, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.10346666666666667, "grad_norm": 0.004628482274711132, "kl": 0.0954132080078125, "learning_rate": 2.861111111111111e-06, "loss": -0.0009, "num_tokens": 22895203.0, "reward": 0.535427987575531, "reward_std": 0.4217776358127594, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.6416449546813965, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.13467980921268463, "step": 97 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5434121621621621, "calib/avg_num_step_conf": 5.2265625, "calib/ece": 0.23955645161290323, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.18951612903225806, "calib/gap": 0.013878378378378575, "calib/mean_conf": 0.7977822580645162, "calib/mu_c": 0.8033783783783784, "calib/mu_w": 0.7894999999999999, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.22028225806451612, "calib/std_conf": 0.1505350704226733, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7124238544474394, "calib/step_q_c_n": 742.0, "calib/step_q_gap": 0.004319827601801718, "calib/step_q_w": 0.7081040268456377, "calib/step_q_w_n": 596.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2832.0, "completions/max_terminated_length": 2832.0, "completions/mean_length": 580.94140625, "completions/mean_terminated_length": 583.2196655273438, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.10453333333333334, "grad_norm": 0.004246114287525415, "kl": 0.07635498046875, "learning_rate": 2.8333333333333335e-06, "loss": 0.0686, "num_tokens": 23150108.0, "reward": 0.6070481538772583, "reward_std": 0.3663785457611084, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6810113191604614, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.22370991110801697, "step": 98 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5169593117551654, "calib/avg_num_step_conf": 5.63671875, "calib/ece": 0.45336, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.196, "calib/gap": 0.018194767646851573, "calib/mean_conf": 0.79216, "calib/mu_c": 0.8040229885057472, "calib/mu_w": 0.7858282208588956, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.44876, "calib/std_conf": 0.1691039159806774, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7280092843326886, "calib/step_q_c_n": 517.0, "calib/step_q_gap": 0.03957515906271025, "calib/step_q_w": 0.6884341252699784, "calib/step_q_w_n": 926.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2670.0, "completions/max_terminated_length": 2670.0, "completions/mean_length": 654.35546875, "completions/mean_terminated_length": 654.35546875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.1056, "grad_norm": 0.004029831383377314, "kl": 0.07680511474609375, "learning_rate": 2.805555555555556e-06, "loss": 0.0207, "num_tokens": 23423423.0, "reward": 0.3443741202354431, "reward_std": 0.39202311635017395, "rewards/accuracy_reward_step": 0.33984375, "rewards/final_brier_reward_step": 0.5394691824913025, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": -0.11243967711925507, "step": 99 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6053447420634921, "calib/avg_num_step_conf": 5.41015625, "calib/ece": 0.2951968503937008, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.1377952755905512, "calib/gap": 0.057713293650793585, "calib/mean_conf": 0.7754330708661418, "calib/mu_c": 0.8040625, "calib/mu_w": 0.7463492063492064, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2833464566929134, "calib/std_conf": 0.16153425762553517, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7255524079320114, "calib/step_q_c_n": 706.0, "calib/step_q_gap": 0.03624460233554605, "calib/step_q_w": 0.6893078055964653, "calib/step_q_w_n": 679.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1979.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 556.93359375, "completions/mean_terminated_length": 559.11767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.10666666666666667, "grad_norm": 0.004563473630696535, "kl": 0.08527374267578125, "learning_rate": 2.7777777777777783e-06, "loss": 0.0299, "num_tokens": 23673406.0, "reward": 0.5480512976646423, "reward_std": 0.3751283586025238, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.6646746397018433, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.13533419370651245, "step": 100 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5611288392500997, "calib/avg_num_step_conf": 5.953125, "calib/ece": 0.355587044534413, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.18218623481781376, "calib/gap": 0.060533173780082494, "calib/mean_conf": 0.79251012145749, "calib/mu_c": 0.8263302752293579, "calib/mu_w": 0.7657971014492754, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3534008097165992, "calib/std_conf": 0.18371386531917389, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.732985315712188, "calib/step_q_c_n": 681.0, "calib/step_q_gap": 0.0508738091878701, "calib/step_q_w": 0.6821115065243178, "calib/step_q_w_n": 843.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2217.0, "completions/max_terminated_length": 2217.0, "completions/mean_length": 619.71484375, "completions/mean_terminated_length": 622.1451416015625, "completions/min_length": 0.0, "completions/min_terminated_length": 200.0, "epoch": 0.10773333333333333, "grad_norm": 0.004082713276147842, "kl": 0.08042144775390625, "learning_rate": 2.7500000000000004e-06, "loss": 0.0349, "num_tokens": 23939045.0, "reward": 0.5178720951080322, "reward_std": 0.37443262338638306, "rewards/accuracy_reward_step": 0.4296875, "rewards/final_brier_reward_step": 0.6013695001602173, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.15624955296516418, "step": 101 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.517766329832904, "calib/avg_num_step_conf": 5.7109375, "calib/ece": 0.23312, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.212, "calib/gap": 0.01893402021002588, "calib/mean_conf": 0.80424, "calib/mu_c": 0.8120408163265307, "calib/mu_w": 0.7931067961165048, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22468, "calib/std_conf": 0.14438982789656618, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7196905940594059, "calib/step_q_c_n": 808.0, "calib/step_q_gap": -0.016349161292276038, "calib/step_q_w": 0.736039755351682, "calib/step_q_w_n": 654.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2713.0, "completions/max_terminated_length": 2713.0, "completions/mean_length": 534.40234375, "completions/mean_terminated_length": 534.40234375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.1088, "grad_norm": 0.004449573345482349, "kl": 0.0848236083984375, "learning_rate": 2.7222222222222224e-06, "loss": 0.012, "num_tokens": 24182548.0, "reward": 0.5914692878723145, "reward_std": 0.316368043422699, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6829195022583008, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.18986272811889648, "step": 102 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5246334698190463, "calib/avg_num_step_conf": 5.19921875, "calib/ece": 0.26611336032388666, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.17813765182186234, "calib/gap": 0.030098401796328167, "calib/mean_conf": 0.7933198380566802, "calib/mu_c": 0.807089552238806, "calib/mu_w": 0.7769911504424778, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.25846153846153846, "calib/std_conf": 0.1614067426411123, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.7253805970149254, "calib/step_q_c_n": 670.0, "calib/step_q_gap": -0.006268419626526911, "calib/step_q_w": 0.7316490166414523, "calib/step_q_w_n": 661.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2265.0, "completions/max_terminated_length": 2265.0, "completions/mean_length": 630.05078125, "completions/mean_terminated_length": 632.5216064453125, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.10986666666666667, "grad_norm": 0.004228085279464722, "kl": 0.0755157470703125, "learning_rate": 2.6944444444444444e-06, "loss": 0.0224, "num_tokens": 24448393.0, "reward": 0.5414041876792908, "reward_std": 0.33534783124923706, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6473226547241211, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.13939201831817627, "step": 103 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5704055059523809, "calib/avg_num_step_conf": 5.53515625, "calib/ece": 0.30763779527559054, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.12992125984251968, "calib/gap": 0.037669890873015976, "calib/mean_conf": 0.7881102362204725, "calib/mu_c": 0.8067968750000001, "calib/mu_w": 0.7691269841269841, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.29590551181102365, "calib/std_conf": 0.15763624170338938, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.737034990791897, "calib/step_q_c_n": 724.0, "calib/step_q_gap": 0.0328502866071928, "calib/step_q_w": 0.7041847041847042, "calib/step_q_w_n": 693.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2902.0, "completions/max_terminated_length": 2902.0, "completions/mean_length": 571.26171875, "completions/mean_terminated_length": 571.26171875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.11093333333333333, "grad_norm": 0.004389981739223003, "kl": 0.0796051025390625, "learning_rate": 2.666666666666667e-06, "loss": 0.0152, "num_tokens": 24701316.0, "reward": 0.5887982249259949, "reward_std": 0.2890593409538269, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.658064067363739, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.22109487652778625, "step": 104 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5200482172313158, "calib/avg_num_step_conf": 5.84375, "calib/ece": 0.25849802371541497, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.14624505928853754, "calib/gap": 0.024042634183479183, "calib/mean_conf": 0.7691699604743083, "calib/mu_c": 0.7797183098591549, "calib/mu_w": 0.7556756756756757, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.23320158102766794, "calib/std_conf": 0.17764517741301394, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7048106060606061, "calib/step_q_c_n": 792.0, "calib/step_q_gap": 0.00456912878787874, "calib/step_q_w": 0.7002414772727273, "calib/step_q_w_n": 704.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2781.0, "completions/max_terminated_length": 2781.0, "completions/mean_length": 584.81640625, "completions/mean_terminated_length": 584.81640625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.112, "grad_norm": 0.00401820195838809, "kl": 0.08333587646484375, "learning_rate": 2.6388888888888893e-06, "loss": 0.0686, "num_tokens": 24956789.0, "reward": 0.6147169470787048, "reward_std": 0.4103122353553772, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.678810179233551, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.2428111732006073, "step": 105 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5796992481203007, "calib/avg_num_step_conf": 5.5234375, "calib/ece": 0.28367588932806315, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.1383399209486166, "calib/gap": 0.03851002506265677, "calib/mean_conf": 0.8024110671936759, "calib/mu_c": 0.8206766917293233, "calib/mu_w": 0.7821666666666666, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28019762845849794, "calib/std_conf": 0.13706161519451995, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7453986013986015, "calib/step_q_c_n": 715.0, "calib/step_q_gap": 0.02011963144151996, "calib/step_q_w": 0.7252789699570815, "calib/step_q_w_n": 699.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1616.0, "completions/max_terminated_length": 1616.0, "completions/mean_length": 527.32421875, "completions/mean_terminated_length": 529.3922119140625, "completions/min_length": 0.0, "completions/min_terminated_length": 186.0, "epoch": 0.11306666666666666, "grad_norm": 0.004746657330542803, "kl": 0.08556365966796875, "learning_rate": 2.6111111111111113e-06, "loss": 0.0056, "num_tokens": 25196368.0, "reward": 0.5568984746932983, "reward_std": 0.35886260867118835, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6666004061698914, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.1456340253353119, "step": 106 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5674223602484472, "calib/avg_num_step_conf": 5.6484375, "calib/ece": 0.28168627450980394, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.1803921568627451, "calib/gap": 0.025102484472049658, "calib/mean_conf": 0.785607843137255, "calib/mu_c": 0.7969285714285714, "calib/mu_w": 0.7718260869565218, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2591372549019608, "calib/std_conf": 0.15821654042474634, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7204850746268656, "calib/step_q_c_n": 804.0, "calib/step_q_gap": 0.008226507648672499, "calib/step_q_w": 0.7122585669781931, "calib/step_q_w_n": 642.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1325.0, "completions/max_terminated_length": 1325.0, "completions/mean_length": 511.19921875, "completions/mean_terminated_length": 513.2039794921875, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.11413333333333334, "grad_norm": 0.004514755215495825, "kl": 0.0935516357421875, "learning_rate": 2.5833333333333337e-06, "loss": 0.0336, "num_tokens": 25431851.0, "reward": 0.6089458465576172, "reward_std": 0.37728819251060486, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6811558604240417, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.22814197838306427, "step": 107 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5240497076023392, "calib/avg_num_step_conf": 6.40625, "calib/ece": 0.1715537848605578, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.20318725099601595, "calib/gap": 0.00889400584795308, "calib/mean_conf": 0.8214342629482072, "calib/mu_c": 0.8242690058479532, "calib/mu_w": 0.8153750000000001, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1558565737051793, "calib/std_conf": 0.12750570122928112, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7346252788104088, "calib/step_q_c_n": 1076.0, "calib/step_q_gap": -0.01913358643781815, "calib/step_q_w": 0.753758865248227, "calib/step_q_w_n": 564.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2303.0, "completions/max_terminated_length": 2303.0, "completions/mean_length": 572.57421875, "completions/mean_terminated_length": 574.8196411132812, "completions/min_length": 0.0, "completions/min_terminated_length": 31.0, "epoch": 0.1152, "grad_norm": 0.004031268414109945, "kl": 0.07091522216796875, "learning_rate": 2.5555555555555557e-06, "loss": -0.021, "num_tokens": 25681662.0, "reward": 0.696212887763977, "reward_std": 0.34606415033340454, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7361562252044678, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.3265819549560547, "step": 108 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6031997414350355, "calib/avg_num_step_conf": 6.375, "calib/ece": 0.3474698795180722, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.17670682730923695, "calib/gap": 0.051499676793794524, "calib/mean_conf": 0.8253815261044176, "calib/mu_c": 0.8522689075630252, "calib/mu_w": 0.8007692307692307, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3474698795180722, "calib/std_conf": 0.12503796620246393, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7522580645161291, "calib/step_q_c_n": 744.0, "calib/step_q_gap": 0.04583914559721014, "calib/step_q_w": 0.706418918918919, "calib/step_q_w_n": 888.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2096.0, "completions/max_terminated_length": 2096.0, "completions/mean_length": 584.484375, "completions/mean_terminated_length": 586.7764892578125, "completions/min_length": 0.0, "completions/min_terminated_length": 211.0, "epoch": 0.11626666666666667, "grad_norm": 0.004027488175779581, "kl": 0.0707855224609375, "learning_rate": 2.5277777777777778e-06, "loss": -0.035, "num_tokens": 25935890.0, "reward": 0.5401857495307922, "reward_std": 0.2600173056125641, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.622322678565979, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.17054879665374756, "step": 109 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.45666666666666667, "calib/avg_num_step_conf": 5.484375, "calib/ece": 0.31175999999999987, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.128, "calib/gap": -0.01839743589743592, "calib/mean_conf": 0.7956, "calib/mu_c": 0.7867692307692308, "calib/mu_w": 0.8051666666666667, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2936799999999999, "calib/std_conf": 0.14804810029176327, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7333526850507982, "calib/step_q_c_n": 689.0, "calib/step_q_gap": 0.011856181554294687, "calib/step_q_w": 0.7214965034965035, "calib/step_q_w_n": 715.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2667.0, "completions/max_terminated_length": 2667.0, "completions/mean_length": 552.046875, "completions/mean_terminated_length": 554.2117919921875, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.11733333333333333, "grad_norm": 0.004308481700718403, "kl": 0.0877532958984375, "learning_rate": 2.5e-06, "loss": 0.062, "num_tokens": 26182134.0, "reward": 0.549569845199585, "reward_std": 0.41790106892585754, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.6282640695571899, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.1740006059408188, "step": 110 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5433070866141732, "calib/avg_num_step_conf": 6.07421875, "calib/ece": 0.3258565737051793, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.14342629482071714, "calib/gap": 0.012054864109728114, "calib/mean_conf": 0.8142231075697212, "calib/mu_c": 0.8203225806451613, "calib/mu_w": 0.8082677165354332, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32302788844621516, "calib/std_conf": 0.11807087320548772, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7442020497803807, "calib/step_q_c_n": 683.0, "calib/step_q_gap": 0.013708930514325646, "calib/step_q_w": 0.7304931192660551, "calib/step_q_w_n": 872.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2788.0, "completions/max_terminated_length": 2788.0, "completions/mean_length": 589.40234375, "completions/mean_terminated_length": 589.40234375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.1184, "grad_norm": 0.003989416174590588, "kl": 0.0763092041015625, "learning_rate": 2.4722222222222226e-06, "loss": 0.0195, "num_tokens": 26440429.0, "reward": 0.5446423292160034, "reward_std": 0.36422672867774963, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.6271018981933594, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.1684325933456421, "step": 111 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5472689075630253, "calib/avg_num_step_conf": 5.4375, "calib/ece": 0.280161943319838, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.145748987854251, "calib/gap": 0.024548975840336218, "calib/mean_conf": 0.7724696356275305, "calib/mu_c": 0.784296875, "calib/mu_w": 0.7597478991596638, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2672064777327935, "calib/std_conf": 0.1633975495515841, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7214992927864214, "calib/step_q_c_n": 707.0, "calib/step_q_gap": 0.034798562859414184, "calib/step_q_w": 0.6867007299270073, "calib/step_q_w_n": 685.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2989.0, "completions/max_terminated_length": 2989.0, "completions/mean_length": 619.1953125, "completions/mean_terminated_length": 621.6235961914062, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.11946666666666667, "grad_norm": 0.004297351930290461, "kl": 0.07457733154296875, "learning_rate": 2.4444444444444447e-06, "loss": 0.0347, "num_tokens": 26706863.0, "reward": 0.5232102870941162, "reward_std": 0.36772620677948, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6476492285728455, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.10502135753631592, "step": 112 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5253927997030805, "calib/avg_num_step_conf": 5.76171875, "calib/ece": 0.2656078431372549, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.11764705882352941, "calib/gap": 0.015623530867252322, "calib/mean_conf": 0.8015294117647058, "calib/mu_c": 0.8087591240875913, "calib/mu_w": 0.793135593220339, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26494117647058824, "calib/std_conf": 0.12424049305818236, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7267310167310168, "calib/step_q_c_n": 777.0, "calib/step_q_gap": 0.03375107403760702, "calib/step_q_w": 0.6929799426934098, "calib/step_q_w_n": 698.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2699.0, "completions/max_terminated_length": 2699.0, "completions/mean_length": 535.06640625, "completions/mean_terminated_length": 535.06640625, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.12053333333333334, "grad_norm": 0.004804201424121857, "kl": 0.0929718017578125, "learning_rate": 2.4166666666666667e-06, "loss": 0.0349, "num_tokens": 26949040.0, "reward": 0.5936443209648132, "reward_std": 0.371960312128067, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6712472438812256, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.2097914069890976, "step": 113 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.562398334309324, "calib/avg_num_step_conf": 5.953125, "calib/ece": 0.25304, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.136, "calib/gap": 0.02785477259418323, "calib/mean_conf": 0.8035999999999999, "calib/mu_c": 0.8157446808510638, "calib/mu_w": 0.7878899082568805, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.24631999999999998, "calib/std_conf": 0.118222840432803, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7349100257069409, "calib/step_q_c_n": 778.0, "calib/step_q_gap": 0.026223698629192826, "calib/step_q_w": 0.708686327077748, "calib/step_q_w_n": 746.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2387.0, "completions/max_terminated_length": 2387.0, "completions/mean_length": 575.57421875, "completions/mean_terminated_length": 575.57421875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.1216, "grad_norm": 0.004351339768618345, "kl": 0.084808349609375, "learning_rate": 2.388888888888889e-06, "loss": 0.057, "num_tokens": 27201411.0, "reward": 0.6057231426239014, "reward_std": 0.3419015407562256, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6800882816314697, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.22588929533958435, "step": 114 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5503722084367245, "calib/avg_num_step_conf": 5.84765625, "calib/ece": 0.3108661417322835, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.11811023622047244, "calib/gap": 0.018978908188585608, "calib/mean_conf": 0.798503937007874, "calib/mu_c": 0.8077692307692307, "calib/mu_w": 0.7887903225806451, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2987795275590552, "calib/std_conf": 0.1413298968664433, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7183467741935484, "calib/step_q_c_n": 744.0, "calib/step_q_gap": -0.005624009338988212, "calib/step_q_w": 0.7239707835325366, "calib/step_q_w_n": 753.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2196.0, "completions/max_terminated_length": 2196.0, "completions/mean_length": 540.47265625, "completions/mean_terminated_length": 540.47265625, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.12266666666666666, "grad_norm": 0.004473676439374685, "kl": 0.0851287841796875, "learning_rate": 2.361111111111111e-06, "loss": 0.0075, "num_tokens": 27445036.0, "reward": 0.5372321605682373, "reward_std": 0.40835297107696533, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.6523203253746033, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.12214392423629761, "step": 115 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4656552077331797, "calib/avg_num_step_conf": 6.015625, "calib/ece": 0.3060400000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.124, "calib/gap": 0.008253632930030008, "calib/mean_conf": 0.79964, "calib/mu_c": 0.8037007874015748, "calib/mu_w": 0.7954471544715448, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.29884000000000005, "calib/std_conf": 0.13730357023763076, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7131073446327684, "calib/step_q_c_n": 708.0, "calib/step_q_gap": -0.00922438613646237, "calib/step_q_w": 0.7223317307692307, "calib/step_q_w_n": 832.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2417.0, "completions/max_terminated_length": 2417.0, "completions/mean_length": 610.34765625, "completions/mean_terminated_length": 612.7412109375, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.12373333333333333, "grad_norm": 0.004027099348604679, "kl": 0.07471466064453125, "learning_rate": 2.3333333333333336e-06, "loss": 0.0287, "num_tokens": 27705805.0, "reward": 0.5554635524749756, "reward_std": 0.3605683445930481, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6302086114883423, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.1877496838569641, "step": 116 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5348603070727409, "calib/avg_num_step_conf": 5.96484375, "calib/ece": 0.35620158102766797, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.16996047430830039, "calib/gap": 0.022886735464384578, "calib/mean_conf": 0.808296442687747, "calib/mu_c": 0.8206896551724138, "calib/mu_w": 0.7978029197080292, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.353, "calib/std_conf": 0.1259064645418673, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7259905362776026, "calib/step_q_c_n": 634.0, "calib/step_q_gap": -0.004468590262150962, "calib/step_q_w": 0.7304591265397535, "calib/step_q_w_n": 893.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2424.0, "completions/max_terminated_length": 2424.0, "completions/mean_length": 561.42578125, "completions/mean_terminated_length": 561.42578125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.1248, "grad_norm": 0.004113308619707823, "kl": 0.077239990234375, "learning_rate": 2.305555555555556e-06, "loss": 0.0105, "num_tokens": 27956130.0, "reward": 0.4996691346168518, "reward_std": 0.37606483697891235, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.6175527572631836, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.09350422024726868, "step": 117 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5593154915079773, "calib/avg_num_step_conf": 6.54296875, "calib/ece": 0.27880000000000005, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.14, "calib/gap": 0.02534096757591353, "calib/mean_conf": 0.7916, "calib/mu_c": 0.8033582089552239, "calib/mu_w": 0.7780172413793104, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.26720000000000005, "calib/std_conf": 0.14713748672585109, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7081600955794505, "calib/step_q_c_n": 837.0, "calib/step_q_gap": 0.03423408125964145, "calib/step_q_w": 0.673926014319809, "calib/step_q_w_n": 838.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2867.0, "completions/max_terminated_length": 2867.0, "completions/mean_length": 581.3671875, "completions/mean_terminated_length": 585.9448852539062, "completions/min_length": 0.0, "completions/min_terminated_length": 176.0, "epoch": 0.12586666666666665, "grad_norm": 0.004203206859529018, "kl": 0.07547760009765625, "learning_rate": 2.277777777777778e-06, "loss": 0.0018, "num_tokens": 28208968.0, "reward": 0.5645748376846313, "reward_std": 0.35757777094841003, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6591734290122986, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.17075751721858978, "step": 118 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5735979836168872, "calib/avg_num_step_conf": 5.546875, "calib/ece": 0.2559288537549407, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.1225296442687747, "calib/gap": 0.05443478260869561, "calib/mean_conf": 0.7843873517786562, "calib/mu_c": 0.8091304347826087, "calib/mu_w": 0.7546956521739131, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24743083003952565, "calib/std_conf": 0.15600534651228387, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7256300268096514, "calib/step_q_c_n": 746.0, "calib/step_q_gap": 0.03079323155742586, "calib/step_q_w": 0.6948367952522255, "calib/step_q_w_n": 674.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2863.0, "completions/max_terminated_length": 2863.0, "completions/mean_length": 606.953125, "completions/mean_terminated_length": 606.953125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.12693333333333334, "grad_norm": 0.004040359519422054, "kl": 0.07463836669921875, "learning_rate": 2.25e-06, "loss": -0.0244, "num_tokens": 28469412.0, "reward": 0.5990918278694153, "reward_std": 0.38760697841644287, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6894567012786865, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.20325830578804016, "step": 119 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.49453114579165, "calib/avg_num_step_conf": 5.5546875, "calib/ece": 0.20059760956175296, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.07569721115537849, "calib/gap": 0.00023609443777505845, "calib/mean_conf": 0.7691235059760957, "calib/mu_c": 0.7692156862745098, "calib/mu_w": 0.7689795918367347, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.18007968127490037, "calib/std_conf": 0.155783678437086, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6988690476190476, "calib/step_q_c_n": 840.0, "calib/step_q_gap": -0.036732326951399163, "calib/step_q_w": 0.7356013745704467, "calib/step_q_w_n": 582.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2504.0, "completions/max_terminated_length": 2504.0, "completions/mean_length": 566.359375, "completions/mean_terminated_length": 566.359375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.128, "grad_norm": 0.00447692908346653, "kl": 0.07639312744140625, "learning_rate": 2.222222222222222e-06, "loss": 0.0449, "num_tokens": 28721088.0, "reward": 0.6634672284126282, "reward_std": 0.33212563395500183, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6975241899490356, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.31456655263900757, "step": 120 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4832033788174139, "calib/avg_num_step_conf": 6.30859375, "calib/ece": 0.2799196787148594, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.0963855421686747, "calib/gap": 0.0013664717348929045, "calib/mean_conf": 0.7843373493975903, "calib/mu_c": 0.784962962962963, "calib/mu_w": 0.7835964912280701, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2610441767068273, "calib/std_conf": 0.14496247489302483, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7130761664564943, "calib/step_q_c_n": 793.0, "calib/step_q_gap": -0.0034116680933840016, "calib/step_q_w": 0.7164878345498783, "calib/step_q_w_n": 822.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2724.0, "completions/max_terminated_length": 2724.0, "completions/mean_length": 630.87109375, "completions/mean_terminated_length": 638.351806640625, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.12906666666666666, "grad_norm": 0.00422104774042964, "kl": 0.0763702392578125, "learning_rate": 2.1944444444444445e-06, "loss": 0.0163, "num_tokens": 28987647.0, "reward": 0.6108550429344177, "reward_std": 0.3837067186832428, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6519953012466431, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.2704959511756897, "step": 121 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5428459119496855, "calib/avg_num_step_conf": 5.67578125, "calib/ece": 0.20047058823529418, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.12156862745098039, "calib/gap": 0.0361871069182389, "calib/mean_conf": 0.7963137254901961, "calib/mu_c": 0.809937106918239, "calib/mu_w": 0.77375, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1866274509803922, "calib/std_conf": 0.13981689569367087, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7348118586088939, "calib/step_q_c_n": 877.0, "calib/step_q_gap": 0.02901324749778278, "calib/step_q_w": 0.7057986111111111, "calib/step_q_w_n": 576.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2533.0, "completions/max_terminated_length": 2533.0, "completions/mean_length": 553.546875, "completions/mean_terminated_length": 553.546875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.13013333333333332, "grad_norm": 0.004603228531777859, "kl": 0.07869720458984375, "learning_rate": 2.166666666666667e-06, "loss": -0.0038, "num_tokens": 29236699.0, "reward": 0.6796014308929443, "reward_std": 0.33652737736701965, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.729982852935791, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.3057824969291687, "step": 122 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.49417317708333336, "calib/avg_num_step_conf": 6.140625, "calib/ece": 0.2578225806451613, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.0846774193548387, "calib/gap": 0.01717187500000006, "calib/mean_conf": 0.7616129032258064, "calib/mu_c": 0.769921875, "calib/mu_w": 0.7527499999999999, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2516532258064516, "calib/std_conf": 0.16044488098255663, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7184172661870504, "calib/step_q_c_n": 695.0, "calib/step_q_gap": 0.010948623085567988, "calib/step_q_w": 0.7074686431014824, "calib/step_q_w_n": 877.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2781.0, "completions/max_terminated_length": 2781.0, "completions/mean_length": 638.91796875, "completions/mean_terminated_length": 641.423583984375, "completions/min_length": 0.0, "completions/min_terminated_length": 210.0, "epoch": 0.1312, "grad_norm": 0.00413132319226861, "kl": 0.07390594482421875, "learning_rate": 2.138888888888889e-06, "loss": 0.0571, "num_tokens": 29505550.0, "reward": 0.5470514297485352, "reward_std": 0.35542139410972595, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.6500972509384155, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.15103670954704285, "step": 123 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5037012987012988, "calib/avg_num_step_conf": 5.84765625, "calib/ece": 0.21070866141732283, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.07874015748031496, "calib/gap": -0.0017311688311688611, "calib/mean_conf": 0.7968503937007875, "calib/mu_c": 0.7961688311688312, "calib/mu_w": 0.7979, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2006299212598425, "calib/std_conf": 0.1221571260497188, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7401129943502824, "calib/step_q_c_n": 885.0, "calib/step_q_gap": -0.0028281821203057778, "calib/step_q_w": 0.7429411764705882, "calib/step_q_w_n": 612.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1372.0, "completions/max_terminated_length": 1372.0, "completions/mean_length": 561.4140625, "completions/mean_terminated_length": 563.61572265625, "completions/min_length": 0.0, "completions/min_terminated_length": 88.0, "epoch": 0.13226666666666667, "grad_norm": 0.004335741512477398, "kl": 0.07759857177734375, "learning_rate": 2.1111111111111114e-06, "loss": 0.0041, "num_tokens": 29756088.0, "reward": 0.6853257417678833, "reward_std": 0.3310573697090149, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7027504444122314, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.34836989641189575, "step": 124 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5575951825951827, "calib/avg_num_step_conf": 5.6015625, "calib/ece": 0.33441767068273087, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.10843373493975904, "calib/gap": 0.02896270396270406, "calib/mean_conf": 0.7895180722891566, "calib/mu_c": 0.804871794871795, "calib/mu_w": 0.775909090909091, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3270281124497991, "calib/std_conf": 0.14176090183068885, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7411801242236025, "calib/step_q_c_n": 644.0, "calib/step_q_gap": 0.026273288780564563, "calib/step_q_w": 0.7149068354430379, "calib/step_q_w_n": 790.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2795.0, "completions/max_terminated_length": 2795.0, "completions/mean_length": 611.23828125, "completions/mean_terminated_length": 611.23828125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.13333333333333333, "grad_norm": 0.004059802275151014, "kl": 0.07111358642578125, "learning_rate": 2.0833333333333334e-06, "loss": 0.0432, "num_tokens": 30017373.0, "reward": 0.5046956539154053, "reward_std": 0.35496553778648376, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.6204128861427307, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.10460340976715088, "step": 125 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5254450464396285, "calib/avg_num_step_conf": 5.88671875, "calib/ece": 0.26948, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.144, "calib/gap": 0.01557404540763685, "calib/mean_conf": 0.7735599999999999, "calib/mu_c": 0.7806617647058823, "calib/mu_w": 0.7650877192982455, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.24951999999999996, "calib/std_conf": 0.17800372580370333, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7304551365409622, "calib/step_q_c_n": 769.0, "calib/step_q_gap": 0.040184133830935, "calib/step_q_w": 0.6902710027100272, "calib/step_q_w_n": 738.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2935.0, "completions/max_terminated_length": 2935.0, "completions/mean_length": 597.98046875, "completions/mean_terminated_length": 600.3255004882812, "completions/min_length": 0.0, "completions/min_terminated_length": 184.0, "epoch": 0.1344, "grad_norm": 0.004169043619185686, "kl": 0.06622314453125, "learning_rate": 2.0555555555555555e-06, "loss": 0.0089, "num_tokens": 30275920.0, "reward": 0.5446716547012329, "reward_std": 0.34021419286727905, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6581730246543884, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.13038893043994904, "step": 126 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5178362573099415, "calib/avg_num_step_conf": 5.97265625, "calib/ece": 0.3217269076305221, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.09236947791164658, "calib/gap": 0.025413255360623888, "calib/mean_conf": 0.7757831325301205, "calib/mu_c": 0.789561403508772, "calib/mu_w": 0.7641481481481481, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.31983935742971886, "calib/std_conf": 0.14432048630633593, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7253539823008849, "calib/step_q_c_n": 678.0, "calib/step_q_gap": 0.02907901167808824, "calib/step_q_w": 0.6962749706227966, "calib/step_q_w_n": 851.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2382.0, "completions/max_terminated_length": 2382.0, "completions/mean_length": 576.84375, "completions/mean_terminated_length": 579.1058959960938, "completions/min_length": 0.0, "completions/min_terminated_length": 185.0, "epoch": 0.13546666666666668, "grad_norm": 0.004571571014821529, "kl": 0.07588958740234375, "learning_rate": 2.027777777777778e-06, "loss": 0.0189, "num_tokens": 30527264.0, "reward": 0.49010002613067627, "reward_std": 0.3242504596710205, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.6249051094055176, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.07170122116804123, "step": 127 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5613778298204528, "calib/avg_num_step_conf": 5.0859375, "calib/ece": 0.27471774193548393, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.07258064516129033, "calib/gap": 0.04295472287275559, "calib/mean_conf": 0.76125, "calib/mu_c": 0.7823809523809523, "calib/mu_w": 0.7394262295081967, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2639516129032259, "calib/std_conf": 0.15535745073861118, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7149917491749175, "calib/step_q_c_n": 606.0, "calib/step_q_gap": 0.04333945032434283, "calib/step_q_w": 0.6716522988505746, "calib/step_q_w_n": 696.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2431.0, "completions/max_terminated_length": 2431.0, "completions/mean_length": 569.40234375, "completions/mean_terminated_length": 576.1541748046875, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.13653333333333334, "grad_norm": 0.004363252781331539, "kl": 0.074676513671875, "learning_rate": 2.0000000000000003e-06, "loss": 0.0028, "num_tokens": 30779695.0, "reward": 0.5591524839401245, "reward_std": 0.387980580329895, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.6609953045845032, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.1659032702445984, "step": 128 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5172991071428571, "calib/avg_num_step_conf": 5.6328125, "calib/ece": 0.231484375, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.08984375, "calib/gap": 0.004444444444444362, "calib/mean_conf": 0.780625, "calib/mu_c": 0.7825694444444444, "calib/mu_w": 0.7781250000000001, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22480468750000002, "calib/std_conf": 0.12434358899838785, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7359466019417477, "calib/step_q_c_n": 824.0, "calib/step_q_gap": 0.0282119741100324, "calib/step_q_w": 0.7077346278317153, "calib/step_q_w_n": 618.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1344.0, "completions/max_terminated_length": 1344.0, "completions/mean_length": 508.23046875, "completions/mean_terminated_length": 510.22357177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.1376, "grad_norm": 0.004571618977934122, "kl": 0.08074951171875, "learning_rate": 1.9722222222222224e-06, "loss": 0.0073, "num_tokens": 31012186.0, "reward": 0.6112266182899475, "reward_std": 0.3370286226272583, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6892741918563843, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.2214602380990982, "step": 129 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5843045843045843, "calib/avg_num_step_conf": 5.53515625, "calib/ece": 0.1876470588235294, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.07058823529411765, "calib/gap": 0.033640248640248616, "calib/mean_conf": 0.7788627450980393, "calib/mu_c": 0.791923076923077, "calib/mu_w": 0.7582828282828283, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17737254901960783, "calib/std_conf": 0.13884072121310173, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7269721577726218, "calib/step_q_c_n": 862.0, "calib/step_q_gap": 0.005963148763612813, "calib/step_q_w": 0.721009009009009, "calib/step_q_w_n": 555.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1380.0, "completions/max_terminated_length": 1380.0, "completions/mean_length": 497.7421875, "completions/mean_terminated_length": 499.69415283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 182.0, "epoch": 0.13866666666666666, "grad_norm": 0.004695298615843058, "kl": 0.08007049560546875, "learning_rate": 1.944444444444445e-06, "loss": -0.0013, "num_tokens": 31244896.0, "reward": 0.6643092632293701, "reward_std": 0.2820330858230591, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7284159660339355, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.27910876274108887, "step": 130 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5473628968774601, "calib/avg_num_step_conf": 5.40234375, "calib/ece": 0.3800398406374502, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.06772908366533864, "calib/gap": 0.010362109682498066, "calib/mean_conf": 0.766414342629482, "calib/mu_c": 0.7725242718446602, "calib/mu_w": 0.7621621621621621, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.36804780876494025, "calib/std_conf": 0.14329882560505158, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7322448979591837, "calib/step_q_c_n": 588.0, "calib/step_q_gap": 0.029490180978051672, "calib/step_q_w": 0.702754716981132, "calib/step_q_w_n": 795.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2650.0, "completions/max_terminated_length": 2650.0, "completions/mean_length": 548.11328125, "completions/mean_terminated_length": 550.2627563476562, "completions/min_length": 0.0, "completions/min_terminated_length": 195.0, "epoch": 0.13973333333333332, "grad_norm": 0.004741629585623741, "kl": 0.07027435302734375, "learning_rate": 1.916666666666667e-06, "loss": -0.0036, "num_tokens": 31491421.0, "reward": 0.47297048568725586, "reward_std": 0.30465346574783325, "rewards/accuracy_reward_step": 0.40234375, "rewards/final_brier_reward_step": 0.6037136912345886, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.06566473841667175, "step": 131 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5562135922330097, "calib/avg_num_step_conf": 6.23046875, "calib/ece": 0.18948616600790508, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.06324110671936758, "calib/gap": 0.0357197411003235, "calib/mean_conf": 0.7773913043478261, "calib/mu_c": 0.7919333333333333, "calib/mu_w": 0.7562135922330098, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.18699604743082998, "calib/std_conf": 0.1312523782712374, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.733171974522293, "calib/step_q_c_n": 785.0, "calib/step_q_gap": 0.07576456711488555, "calib/step_q_w": 0.6574074074074074, "calib/step_q_w_n": 810.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1863.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 547.80859375, "completions/mean_terminated_length": 549.9569091796875, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.1408, "grad_norm": 0.004259465262293816, "kl": 0.0728912353515625, "learning_rate": 1.888888888888889e-06, "loss": 0.0205, "num_tokens": 31737252.0, "reward": 0.6615229845046997, "reward_std": 0.3438699543476105, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7125101685523987, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.2964732348918915, "step": 132 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5110264900662251, "calib/avg_num_step_conf": 5.83203125, "calib/ece": 0.38274900398406375, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.09561752988047809, "calib/gap": 0.019666225165562823, "calib/mean_conf": 0.7677689243027888, "calib/mu_c": 0.7796, "calib/mu_w": 0.7599337748344371, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.37605577689243025, "calib/std_conf": 0.1573808358108324, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7125268817204302, "calib/step_q_c_n": 558.0, "calib/step_q_gap": 0.02055896728192752, "calib/step_q_w": 0.6919679144385027, "calib/step_q_w_n": 935.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2411.0, "completions/max_terminated_length": 2411.0, "completions/mean_length": 629.44921875, "completions/mean_terminated_length": 634.405517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 217.0, "epoch": 0.14186666666666667, "grad_norm": 0.004305138718336821, "kl": 0.06676483154296875, "learning_rate": 1.8611111111111113e-06, "loss": 0.008, "num_tokens": 32004735.0, "reward": 0.44489115476608276, "reward_std": 0.4102109670639038, "rewards/accuracy_reward_step": 0.390625, "rewards/final_brier_reward_step": 0.59170001745224, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.025426030158996582, "step": 133 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6138782051282051, "calib/avg_num_step_conf": 5.51171875, "calib/ece": 0.30723999999999996, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.092, "calib/gap": 0.049352564102564056, "calib/mean_conf": 0.78492, "calib/mu_c": 0.8105833333333333, "calib/mu_w": 0.7612307692307693, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.30607999999999996, "calib/std_conf": 0.12528445075108083, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7180841121495327, "calib/step_q_c_n": 642.0, "calib/step_q_gap": 0.03820114726006585, "calib/step_q_w": 0.6798829648894669, "calib/step_q_w_n": 769.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1600.0, "completions/max_terminated_length": 1600.0, "completions/mean_length": 631.734375, "completions/mean_terminated_length": 634.2117919921875, "completions/min_length": 0.0, "completions/min_terminated_length": 194.0, "epoch": 0.14293333333333333, "grad_norm": 0.004371109418570995, "kl": 0.0604248046875, "learning_rate": 1.8333333333333333e-06, "loss": -0.0015, "num_tokens": 32275411.0, "reward": 0.5322444438934326, "reward_std": 0.4109966456890106, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.6493402719497681, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.1268673986196518, "step": 134 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5147754137115839, "calib/avg_num_step_conf": 5.90625, "calib/ece": 0.2445882352941177, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.10588235294117647, "calib/gap": 0.009561403508771882, "calib/mean_conf": 0.7823921568627451, "calib/mu_c": 0.7866666666666666, "calib/mu_w": 0.7771052631578947, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23701960784313728, "calib/std_conf": 0.12859178128090068, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7156049382716049, "calib/step_q_c_n": 810.0, "calib/step_q_gap": 0.01512060778727442, "calib/step_q_w": 0.7004843304843305, "calib/step_q_w_n": 702.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1789.0, "completions/max_terminated_length": 1789.0, "completions/mean_length": 570.97265625, "completions/mean_terminated_length": 573.2117919921875, "completions/min_length": 0.0, "completions/min_terminated_length": 203.0, "epoch": 0.144, "grad_norm": 0.004456392489373684, "kl": 0.06621551513671875, "learning_rate": 1.8055555555555557e-06, "loss": 0.0222, "num_tokens": 32527460.0, "reward": 0.6077929735183716, "reward_std": 0.3696410655975342, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6856574416160583, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.22055339813232422, "step": 135 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6299169859514687, "calib/avg_num_step_conf": 6.046875, "calib/ece": 0.3193227091633467, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.06374501992031872, "calib/gap": 0.06505810983397176, "calib/mean_conf": 0.7814741035856574, "calib/mu_c": 0.8164655172413792, "calib/mu_w": 0.7514074074074074, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3193227091633467, "calib/std_conf": 0.13342193476332617, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7216311874105866, "calib/step_q_c_n": 699.0, "calib/step_q_gap": 0.03562412027277739, "calib/step_q_w": 0.6860070671378092, "calib/step_q_w_n": 849.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2287.0, "completions/max_terminated_length": 2287.0, "completions/mean_length": 572.515625, "completions/mean_terminated_length": 574.7608032226562, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.14506666666666668, "grad_norm": 0.0045397537760436535, "kl": 0.06781768798828125, "learning_rate": 1.777777777777778e-06, "loss": 0.0246, "num_tokens": 32782512.0, "reward": 0.5561786890029907, "reward_std": 0.34354400634765625, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.6489378809928894, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.17748203873634338, "step": 136 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5825439361834726, "calib/avg_num_step_conf": 6.1640625, "calib/ece": 0.2304313725490197, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.07058823529411765, "calib/gap": 0.032907889816776725, "calib/mean_conf": 0.7744313725490196, "calib/mu_c": 0.7890140845070422, "calib/mu_w": 0.7561061946902655, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2240000000000001, "calib/std_conf": 0.13749142645669404, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7308179723502305, "calib/step_q_c_n": 868.0, "calib/step_q_gap": 0.04591656389952625, "calib/step_q_w": 0.6849014084507042, "calib/step_q_w_n": 710.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1576.0, "completions/max_terminated_length": 1576.0, "completions/mean_length": 565.7890625, "completions/mean_terminated_length": 568.0078735351562, "completions/min_length": 0.0, "completions/min_terminated_length": 207.0, "epoch": 0.14613333333333334, "grad_norm": 0.004167406819760799, "kl": 0.06789398193359375, "learning_rate": 1.75e-06, "loss": 0.035, "num_tokens": 33034338.0, "reward": 0.6721287369728088, "reward_std": 0.3198203444480896, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6990191340446472, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.3358633518218994, "step": 137 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5444679544058426, "calib/avg_num_step_conf": 5.47265625, "calib/ece": 0.15186507936507934, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.06746031746031746, "calib/gap": 0.02965599617773551, "calib/mean_conf": 0.7634523809523809, "calib/mu_c": 0.7741614906832298, "calib/mu_w": 0.7445054945054943, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1382142857142857, "calib/std_conf": 0.13909994734533124, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7092874396135265, "calib/step_q_c_n": 828.0, "calib/step_q_gap": 0.04131187242329959, "calib/step_q_w": 0.6679755671902269, "calib/step_q_w_n": 573.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2761.0, "completions/max_terminated_length": 2761.0, "completions/mean_length": 547.69140625, "completions/mean_terminated_length": 549.8392333984375, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.1472, "grad_norm": 0.004642318934202194, "kl": 0.06908416748046875, "learning_rate": 1.7222222222222224e-06, "loss": 0.0184, "num_tokens": 33278883.0, "reward": 0.6913992166519165, "reward_std": 0.3592164218425751, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7364199161529541, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.3237221837043762, "step": 138 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5907372400756143, "calib/avg_num_step_conf": 5.41796875, "calib/ece": 0.16612648221343873, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.03557312252964427, "calib/gap": 0.03127329192546602, "calib/mean_conf": 0.7714229249011858, "calib/mu_c": 0.7827950310559006, "calib/mu_w": 0.7515217391304346, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1505928853754941, "calib/std_conf": 0.12221745259785197, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7193537015276147, "calib/step_q_c_n": 851.0, "calib/step_q_gap": 0.021909671676868303, "calib/step_q_w": 0.6974440298507464, "calib/step_q_w_n": 536.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2808.0, "completions/max_terminated_length": 2808.0, "completions/mean_length": 549.0390625, "completions/mean_terminated_length": 549.0390625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.14826666666666666, "grad_norm": 0.004201711155474186, "kl": 0.06937408447265625, "learning_rate": 1.6944444444444446e-06, "loss": 0.013, "num_tokens": 33522533.0, "reward": 0.7442376613616943, "reward_std": 0.28208547830581665, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7411026954650879, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.42393502593040466, "step": 139 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6013111268603828, "calib/avg_num_step_conf": 5.30078125, "calib/ece": 0.12808764940239045, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.07569721115537849, "calib/gap": 0.04478454996456416, "calib/mean_conf": 0.7806772908366534, "calib/mu_c": 0.7958433734939759, "calib/mu_w": 0.7510588235294118, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12370517928286853, "calib/std_conf": 0.12407080712845006, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7337, "calib/step_q_c_n": 900.0, "calib/step_q_gap": 0.0634374179431072, "calib/step_q_w": 0.6702625820568928, "calib/step_q_w_n": 457.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1905.0, "completions/max_terminated_length": 1905.0, "completions/mean_length": 534.484375, "completions/mean_terminated_length": 538.6929321289062, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.14933333333333335, "grad_norm": 0.004485068377107382, "kl": 0.07003021240234375, "learning_rate": 1.6666666666666667e-06, "loss": -0.0128, "num_tokens": 33764377.0, "reward": 0.7021718621253967, "reward_std": 0.32926690578460693, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7483788728713989, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.3309648036956787, "step": 140 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.62610668032487, "calib/avg_num_step_conf": 5.51953125, "calib/ece": 0.08313492063492071, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.06349206349206349, "calib/gap": 0.06611472890905079, "calib/mean_conf": 0.7601984126984127, "calib/mu_c": 0.7809248554913294, "calib/mu_w": 0.7148101265822786, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.0784126984126985, "calib/std_conf": 0.13927092016580372, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.69957805907173, "calib/step_q_c_n": 948.0, "calib/step_q_gap": 0.01944902681366545, "calib/step_q_w": 0.6801290322580645, "calib/step_q_w_n": 465.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2821.0, "completions/max_terminated_length": 2821.0, "completions/mean_length": 582.05078125, "completions/mean_terminated_length": 584.3333740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 231.0, "epoch": 0.1504, "grad_norm": 0.004221404902637005, "kl": 0.06427001953125, "learning_rate": 1.638888888888889e-06, "loss": 0.0391, "num_tokens": 34020478.0, "reward": 0.7531248331069946, "reward_std": 0.33923524618148804, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7742726802825928, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.40072697401046753, "step": 141 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5905033238366572, "calib/avg_num_step_conf": 5.61328125, "calib/ece": 0.25210317460317466, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.10714285714285714, "calib/gap": 0.044119658119658234, "calib/mean_conf": 0.7699603174603175, "calib/mu_c": 0.7904444444444445, "calib/mu_w": 0.7463247863247863, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24317460317460327, "calib/std_conf": 0.1531015277205796, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7170850767085079, "calib/step_q_c_n": 717.0, "calib/step_q_gap": 0.03491841004184115, "calib/step_q_w": 0.6821666666666667, "calib/step_q_w_n": 720.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2188.0, "completions/max_terminated_length": 2188.0, "completions/mean_length": 600.36328125, "completions/mean_terminated_length": 600.36328125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.15146666666666667, "grad_norm": 0.004155976232141256, "kl": 0.06311798095703125, "learning_rate": 1.6111111111111113e-06, "loss": 0.0204, "num_tokens": 34279331.0, "reward": 0.6085727214813232, "reward_std": 0.2957516610622406, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6840535402297974, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.23074819147586823, "step": 142 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5991659821823466, "calib/avg_num_step_conf": 5.61328125, "calib/ece": 0.23373015873015876, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.015873015873015872, "calib/gap": 0.042750995134896, "calib/mean_conf": 0.7564285714285715, "calib/mu_c": 0.7766165413533834, "calib/mu_w": 0.7338655462184874, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.23119047619047625, "calib/std_conf": 0.1319148612258865, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7176137931034483, "calib/step_q_c_n": 725.0, "calib/step_q_gap": 0.056118357710189914, "calib/step_q_w": 0.6614954353932584, "calib/step_q_w_n": 712.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2900.0, "completions/max_terminated_length": 2900.0, "completions/mean_length": 572.58203125, "completions/mean_terminated_length": 574.8275146484375, "completions/min_length": 0.0, "completions/min_terminated_length": 235.0, "epoch": 0.15253333333333333, "grad_norm": 0.004387116525322199, "kl": 0.06169891357421875, "learning_rate": 1.5833333333333333e-06, "loss": -0.0056, "num_tokens": 34533248.0, "reward": 0.63039231300354, "reward_std": 0.31463029980659485, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6914234161376953, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.2685798406600952, "step": 143 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5462456655108029, "calib/avg_num_step_conf": 5.55078125, "calib/ece": 0.1612549019607843, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.07450980392156863, "calib/gap": 0.023340890904241296, "calib/mean_conf": 0.7551372549019608, "calib/mu_c": 0.763558282208589, "calib/mu_w": 0.7402173913043477, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13858823529411762, "calib/std_conf": 0.15238200444520655, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7097377049180328, "calib/step_q_c_n": 915.0, "calib/step_q_gap": 0.03125944404846748, "calib/step_q_w": 0.6784782608695653, "calib/step_q_w_n": 506.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1782.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 536.90625, "completions/mean_terminated_length": 536.90625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.1536, "grad_norm": 0.005076371133327484, "kl": 0.0737457275390625, "learning_rate": 1.5555555555555558e-06, "loss": 0.002, "num_tokens": 34774824.0, "reward": 0.6651709079742432, "reward_std": 0.3426276743412018, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7367910146713257, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.26776954531669617, "step": 144 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5868158711295967, "calib/avg_num_step_conf": 5.82421875, "calib/ece": 0.19749999999999998, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.0873015873015873, "calib/gap": 0.030166369578134455, "calib/mean_conf": 0.7871031746031746, "calib/mu_c": 0.7989542483660131, "calib/mu_w": 0.7687878787878787, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1887301587301587, "calib/std_conf": 0.11359887165247524, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7066727272727272, "calib/step_q_c_n": 847.0, "calib/step_q_gap": 0.0201199322416713, "calib/step_q_w": 0.6865527950310559, "calib/step_q_w_n": 644.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2102.0, "completions/max_terminated_length": 2102.0, "completions/mean_length": 545.01953125, "completions/mean_terminated_length": 547.1569213867188, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.15466666666666667, "grad_norm": 0.004416238516569138, "kl": 0.06328582763671875, "learning_rate": 1.527777777777778e-06, "loss": 0.0301, "num_tokens": 35017053.0, "reward": 0.6805466413497925, "reward_std": 0.37044334411621094, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7191644906997681, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.3255225419998169, "step": 145 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5401602564102564, "calib/avg_num_step_conf": 5.76171875, "calib/ece": 0.3114399999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.072, "calib/gap": 0.01771153846153839, "calib/mean_conf": 0.78904, "calib/mu_c": 0.7982499999999999, "calib/mu_w": 0.7805384615384615, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3102399999999999, "calib/std_conf": 0.11241298145676949, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7241643059490085, "calib/step_q_c_n": 706.0, "calib/step_q_gap": 0.023071978250698844, "calib/step_q_w": 0.7010923276983096, "calib/step_q_w_n": 769.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2467.0, "completions/max_terminated_length": 2467.0, "completions/mean_length": 556.53515625, "completions/mean_terminated_length": 563.1343994140625, "completions/min_length": 0.0, "completions/min_terminated_length": 217.0, "epoch": 0.15573333333333333, "grad_norm": 0.004498027730733156, "kl": 0.06241607666015625, "learning_rate": 1.5e-06, "loss": 0.0025, "num_tokens": 35266742.0, "reward": 0.5354191064834595, "reward_std": 0.38119083642959595, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.6358391046524048, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.14593657851219177, "step": 146 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5771493212669684, "calib/avg_num_step_conf": 6.05859375, "calib/ece": 0.3080321285140561, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.05622489959839357, "calib/gap": 0.027821590174531008, "calib/mean_conf": 0.7779116465863454, "calib/mu_c": 0.7924369747899157, "calib/mu_w": 0.7646153846153847, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.30401606425702804, "calib/std_conf": 0.12988586364651236, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7223343373493976, "calib/step_q_c_n": 664.0, "calib/step_q_gap": 0.036347866097988235, "calib/step_q_w": 0.6859864712514093, "calib/step_q_w_n": 887.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3035.0, "completions/max_terminated_length": 3035.0, "completions/mean_length": 591.16015625, "completions/mean_terminated_length": 593.4784545898438, "completions/min_length": 0.0, "completions/min_terminated_length": 196.0, "epoch": 0.1568, "grad_norm": 0.004279103595763445, "kl": 0.059326171875, "learning_rate": 1.4722222222222225e-06, "loss": 0.0406, "num_tokens": 35521759.0, "reward": 0.4943665862083435, "reward_std": 0.3192962110042572, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.6381163597106934, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.0638980120420456, "step": 147 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5623113854595336, "calib/avg_num_step_conf": 5.30859375, "calib/ece": 0.16972222222222225, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.11507936507936507, "calib/gap": 0.01374074074074072, "calib/mean_conf": 0.7775000000000001, "calib/mu_c": 0.7824074074074074, "calib/mu_w": 0.7686666666666667, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1521825396825397, "calib/std_conf": 0.13714962574236156, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7106971153846153, "calib/step_q_c_n": 832.0, "calib/step_q_gap": -0.0025097157349293076, "calib/step_q_w": 0.7132068311195446, "calib/step_q_w_n": 527.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3060.0, "completions/max_terminated_length": 3060.0, "completions/mean_length": 529.66796875, "completions/mean_terminated_length": 529.66796875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.15786666666666666, "grad_norm": 0.004526691976934671, "kl": 0.06888580322265625, "learning_rate": 1.4444444444444445e-06, "loss": 0.0661, "num_tokens": 35762466.0, "reward": 0.7067558169364929, "reward_std": 0.2901526391506195, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7282199263572693, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.3618541955947876, "step": 148 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5815457725562669, "calib/avg_num_step_conf": 5.3828125, "calib/ece": 0.21729838709677418, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.07258064516129033, "calib/gap": 0.05388621213121281, "calib/mean_conf": 0.7728629032258065, "calib/mu_c": 0.7965467625899281, "calib/mu_w": 0.7426605504587153, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.21483870967741936, "calib/std_conf": 0.14994982142840313, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.7293567954220316, "calib/step_q_c_n": 699.0, "calib/step_q_gap": 0.059842804258555815, "calib/step_q_w": 0.6695139911634758, "calib/step_q_w_n": 679.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2612.0, "completions/max_terminated_length": 2612.0, "completions/mean_length": 627.4296875, "completions/mean_terminated_length": 627.4296875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.15893333333333334, "grad_norm": 0.004020726308226585, "kl": 0.06133270263671875, "learning_rate": 1.4166666666666667e-06, "loss": 0.0344, "num_tokens": 36027544.0, "reward": 0.6291335821151733, "reward_std": 0.30805811285972595, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6803281307220459, "rewards/format_reward_step": 0.953125, "rewards/step_correlation_reward": 0.2787202000617981, "step": 149 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6293198113816937, "calib/avg_num_step_conf": 5.30859375, "calib/ece": 0.26472, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.12, "calib/gap": 0.037041534784574615, "calib/mean_conf": 0.80552, "calib/mu_c": 0.8222627737226278, "calib/mu_w": 0.7852212389380532, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.26112, "calib/std_conf": 0.11956558702235356, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.71859375, "calib/step_q_c_n": 704.0, "calib/step_q_gap": 0.02201665076335879, "calib/step_q_w": 0.6965770992366412, "calib/step_q_w_n": 655.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1897.0, "completions/max_terminated_length": 1897.0, "completions/mean_length": 496.05078125, "completions/mean_terminated_length": 497.99609375, "completions/min_length": 0.0, "completions/min_terminated_length": 149.0, "epoch": 0.16, "grad_norm": 0.0046783071011304855, "kl": 0.0672607421875, "learning_rate": 1.3888888888888892e-06, "loss": 0.0248, "num_tokens": 36259493.0, "reward": 0.6306412220001221, "reward_std": 0.32415205240249634, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6738687753677368, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.28507000207901, "step": 150 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5477752057613168, "calib/avg_num_step_conf": 5.30859375, "calib/ece": 0.36337301587301585, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.0873015873015873, "calib/gap": 0.02909722222222233, "calib/mean_conf": 0.7883730158730159, "calib/mu_c": 0.8050000000000002, "calib/mu_w": 0.7759027777777778, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.36158730158730157, "calib/std_conf": 0.1212244562605517, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7051403508771931, "calib/step_q_c_n": 570.0, "calib/step_q_gap": -0.02286978853979038, "calib/step_q_w": 0.7280101394169834, "calib/step_q_w_n": 789.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2477.0, "completions/max_terminated_length": 2477.0, "completions/mean_length": 567.50390625, "completions/mean_terminated_length": 571.972412109375, "completions/min_length": 0.0, "completions/min_terminated_length": 191.0, "epoch": 0.16106666666666666, "grad_norm": 0.0045959255658090115, "kl": 0.05704498291015625, "learning_rate": 1.3611111111111112e-06, "loss": 0.0222, "num_tokens": 36511798.0, "reward": 0.4811530113220215, "reward_std": 0.3167354166507721, "rewards/accuracy_reward_step": 0.421875, "rewards/final_brier_reward_step": 0.6115511655807495, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.0702861025929451, "step": 151 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5257395313100269, "calib/avg_num_step_conf": 5.3046875, "calib/ece": 0.33665338645418336, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.099601593625498, "calib/gap": 0.014992956844666394, "calib/mean_conf": 0.78199203187251, "calib/mu_c": 0.7901754385964912, "calib/mu_w": 0.7751824817518248, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3322310756972113, "calib/std_conf": 0.1295103248366623, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7425299145299146, "calib/step_q_c_n": 585.0, "calib/step_q_gap": 0.045311285810638924, "calib/step_q_w": 0.6972186287192756, "calib/step_q_w_n": 773.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2924.0, "completions/max_terminated_length": 2924.0, "completions/mean_length": 557.63671875, "completions/mean_terminated_length": 557.63671875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.16213333333333332, "grad_norm": 0.004607094451785088, "kl": 0.061553955078125, "learning_rate": 1.3333333333333334e-06, "loss": 0.0075, "num_tokens": 36759945.0, "reward": 0.5449575781822205, "reward_std": 0.35911184549331665, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.6194121241569519, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.18612799048423767, "step": 152 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.49685656879901485, "calib/avg_num_step_conf": 5.2421875, "calib/ece": 0.24752000000000005, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.076, "calib/gap": 0.002478449672694283, "calib/mean_conf": 0.7833600000000002, "calib/mu_c": 0.7844604316546763, "calib/mu_w": 0.781981981981982, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.23744000000000004, "calib/std_conf": 0.1306901312265008, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7036757532281205, "calib/step_q_c_n": 697.0, "calib/step_q_gap": 0.013009086561453831, "calib/step_q_w": 0.6906666666666667, "calib/step_q_w_n": 645.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2376.0, "completions/max_terminated_length": 2376.0, "completions/mean_length": 568.30859375, "completions/mean_terminated_length": 568.30859375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.1632, "grad_norm": 0.00460352748632431, "kl": 0.06215667724609375, "learning_rate": 1.3055555555555556e-06, "loss": 0.0523, "num_tokens": 37012752.0, "reward": 0.6116618514060974, "reward_std": 0.3292079567909241, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.669518768787384, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.2498987466096878, "step": 153 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5392928535732133, "calib/avg_num_step_conf": 5.1953125, "calib/ece": 0.3261811023622046, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.06692913385826772, "calib/gap": 0.01358945527236377, "calib/mean_conf": 0.7764960629921259, "calib/mu_c": 0.7838793103448276, "calib/mu_w": 0.7702898550724638, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3229921259842518, "calib/std_conf": 0.13031430049550902, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7374398625429552, "calib/step_q_c_n": 582.0, "calib/step_q_gap": 0.03261365933439908, "calib/step_q_w": 0.7048262032085562, "calib/step_q_w_n": 748.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2537.0, "completions/max_terminated_length": 2537.0, "completions/mean_length": 543.01953125, "completions/mean_terminated_length": 543.01953125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.16426666666666667, "grad_norm": 0.004732536617666483, "kl": 0.0688323974609375, "learning_rate": 1.2777777777777779e-06, "loss": 0.0078, "num_tokens": 37256205.0, "reward": 0.5141885280609131, "reward_std": 0.3324345648288727, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.634368360042572, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.10494619607925415, "step": 154 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.47309670136711396, "calib/avg_num_step_conf": 5.16015625, "calib/ece": 0.3157312252964428, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.05928853754940711, "calib/gap": -0.007140348676784036, "calib/mean_conf": 0.7693280632411067, "calib/mu_c": 0.765546218487395, "calib/mu_w": 0.772686567164179, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3073517786561266, "calib/std_conf": 0.1342345562436322, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6943515358361775, "calib/step_q_c_n": 586.0, "calib/step_q_gap": -8.383831348246495e-05, "calib/step_q_w": 0.69443537414966, "calib/step_q_w_n": 735.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1818.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 498.0859375, "completions/mean_terminated_length": 502.00787353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.16533333333333333, "grad_norm": 0.004977459087967873, "kl": 0.069793701171875, "learning_rate": 1.25e-06, "loss": -0.0006, "num_tokens": 37490931.0, "reward": 0.5263241529464722, "reward_std": 0.36180227994918823, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.6324187517166138, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.1296045333147049, "step": 155 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5958965209634255, "calib/avg_num_step_conf": 5.5234375, "calib/ece": 0.23553784860557767, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.07569721115537849, "calib/gap": 0.06108449088823753, "calib/mean_conf": 0.7654183266932271, "calib/mu_c": 0.7941353383458647, "calib/mu_w": 0.7330508474576272, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.23553784860557767, "calib/std_conf": 0.1484478938685652, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7180300957592337, "calib/step_q_c_n": 731.0, "calib/step_q_gap": 0.01617065212819413, "calib/step_q_w": 0.7018594436310396, "calib/step_q_w_n": 683.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2494.0, "completions/max_terminated_length": 2494.0, "completions/mean_length": 546.1015625, "completions/mean_terminated_length": 546.1015625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.1664, "grad_norm": 0.004691540729254484, "kl": 0.06121063232421875, "learning_rate": 1.2222222222222223e-06, "loss": 0.0518, "num_tokens": 37735493.0, "reward": 0.6621110439300537, "reward_std": 0.3094662129878998, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6900647878646851, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.3333760201931, "step": 156 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5312153796024763, "calib/avg_num_step_conf": 5.8515625, "calib/ece": 0.17783464566929125, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.06692913385826772, "calib/gap": 0.022703812316715588, "calib/mean_conf": 0.7747637795275589, "calib/mu_c": 0.7836129032258065, "calib/mu_w": 0.7609090909090909, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17118110236220463, "calib/std_conf": 0.1261847137935653, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7137927461139896, "calib/step_q_c_n": 965.0, "calib/step_q_gap": 0.0181079431121135, "calib/step_q_w": 0.6956848030018761, "calib/step_q_w_n": 533.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2275.0, "completions/max_terminated_length": 2275.0, "completions/mean_length": 529.3515625, "completions/mean_terminated_length": 529.3515625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.16746666666666668, "grad_norm": 0.004464243073016405, "kl": 0.0630035400390625, "learning_rate": 1.1944444444444446e-06, "loss": 0.0011, "num_tokens": 37974735.0, "reward": 0.6527853012084961, "reward_std": 0.35439929366111755, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7229777574539185, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.26384279131889343, "step": 157 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.48933588761174973, "calib/avg_num_step_conf": 5.10546875, "calib/ece": 0.27366533864541837, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.04780876494023904, "calib/gap": -0.017674329501915942, "calib/mean_conf": 0.7723904382470119, "calib/mu_c": 0.7642222222222221, "calib/mu_w": 0.7818965517241381, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2541035856573705, "calib/std_conf": 0.13399717153837998, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7058136094674556, "calib/step_q_c_n": 676.0, "calib/step_q_gap": 0.004598078564127639, "calib/step_q_w": 0.701215530903328, "calib/step_q_w_n": 631.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2767.0, "completions/max_terminated_length": 2767.0, "completions/mean_length": 501.10546875, "completions/mean_terminated_length": 503.07061767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.16853333333333334, "grad_norm": 0.004322138614952564, "kl": 0.0720062255859375, "learning_rate": 1.1666666666666668e-06, "loss": 0.0199, "num_tokens": 38208258.0, "reward": 0.5553004741668701, "reward_std": 0.3481801748275757, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6541483998298645, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.15567129850387573, "step": 158 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6022792385560578, "calib/avg_num_step_conf": 4.89453125, "calib/ece": 0.24646825396825386, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.10317460317460317, "calib/gap": 0.0364238874387216, "calib/mean_conf": 0.7980555555555556, "calib/mu_c": 0.8143884892086332, "calib/mu_w": 0.7779646017699116, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.24646825396825386, "calib/std_conf": 0.10246074010462054, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7284534534534535, "calib/step_q_c_n": 666.0, "calib/step_q_gap": 0.011639143402346175, "calib/step_q_w": 0.7168143100511073, "calib/step_q_w_n": 587.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1939.0, "completions/max_terminated_length": 1939.0, "completions/mean_length": 492.5, "completions/mean_terminated_length": 498.3399353027344, "completions/min_length": 0.0, "completions/min_terminated_length": 201.0, "epoch": 0.1696, "grad_norm": 0.005003445781767368, "kl": 0.0658721923828125, "learning_rate": 1.138888888888889e-06, "loss": -0.0221, "num_tokens": 38439122.0, "reward": 0.657343864440918, "reward_std": 0.33716249465942383, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6885058283805847, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.3207131028175354, "step": 159 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4872908622908623, "calib/avg_num_step_conf": 5.3515625, "calib/ece": 0.24545816733067732, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.055776892430278883, "calib/gap": -0.015550193050193006, "calib/mean_conf": 0.7729482071713148, "calib/mu_c": 0.7660714285714286, "calib/mu_w": 0.7816216216216216, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.23031872509960163, "calib/std_conf": 0.12740465268063883, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6951212553495006, "calib/step_q_c_n": 701.0, "calib/step_q_gap": -0.010289805936000151, "calib/step_q_w": 0.7054110612855008, "calib/step_q_w_n": 669.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2762.0, "completions/max_terminated_length": 2762.0, "completions/mean_length": 552.41796875, "completions/mean_terminated_length": 552.41796875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.17066666666666666, "grad_norm": 0.004410970024764538, "kl": 0.0662689208984375, "learning_rate": 1.111111111111111e-06, "loss": 0.0435, "num_tokens": 38685381.0, "reward": 0.6169449687004089, "reward_std": 0.363558292388916, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6697894930839539, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.2586316764354706, "step": 160 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6014192785334121, "calib/avg_num_step_conf": 5.08203125, "calib/ece": 0.11787401574803148, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.07480314960629922, "calib/gap": 0.027143701951507948, "calib/mean_conf": 0.7716535433070866, "calib/mu_c": 0.7797752808988763, "calib/mu_w": 0.7526315789473683, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.09437007874015745, "calib/std_conf": 0.14143674348875832, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7118466522678186, "calib/step_q_c_n": 926.0, "calib/step_q_gap": 0.0029933189344852895, "calib/step_q_w": 0.7088533333333333, "calib/step_q_w_n": 375.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2132.0, "completions/max_terminated_length": 2132.0, "completions/mean_length": 498.08984375, "completions/mean_terminated_length": 498.08984375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.17173333333333332, "grad_norm": 0.005129239987581968, "kl": 0.06465911865234375, "learning_rate": 1.0833333333333335e-06, "loss": 0.0135, "num_tokens": 38916812.0, "reward": 0.7501875758171082, "reward_std": 0.2995142638683319, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.7687234282493591, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.3949328362941742, "step": 161 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5519480519480519, "calib/avg_num_step_conf": 5.11328125, "calib/ece": 0.14769531249999998, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.07421875, "calib/gap": 0.02688444888444874, "calib/mean_conf": 0.7823828125, "calib/mu_c": 0.7919393939393938, "calib/mu_w": 0.7650549450549451, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1427734375, "calib/std_conf": 0.11652934321272836, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7153942428035044, "calib/step_q_c_n": 799.0, "calib/step_q_gap": 0.008982478097622093, "calib/step_q_w": 0.7064117647058823, "calib/step_q_w_n": 510.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1145.0, "completions/max_terminated_length": 1145.0, "completions/mean_length": 480.0625, "completions/mean_terminated_length": 481.94512939453125, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.1728, "grad_norm": 0.004994014278054237, "kl": 0.06990814208984375, "learning_rate": 1.0555555555555557e-06, "loss": 0.0274, "num_tokens": 39143852.0, "reward": 0.7359399795532227, "reward_std": 0.3165464401245117, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7494761943817139, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.39427870512008667, "step": 162 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5603504218040234, "calib/avg_num_step_conf": 5.05859375, "calib/ece": 0.31429718875502016, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.07630522088353414, "calib/gap": 0.028836469824788935, "calib/mean_conf": 0.7699598393574297, "calib/mu_c": 0.7854782608695651, "calib/mu_w": 0.7566417910447761, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.31120481927710847, "calib/std_conf": 0.13173692292223305, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7043589743589743, "calib/step_q_c_n": 546.0, "calib/step_q_gap": 0.01997980212933481, "calib/step_q_w": 0.6843791722296395, "calib/step_q_w_n": 749.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2435.0, "completions/max_terminated_length": 2435.0, "completions/mean_length": 542.75, "completions/mean_terminated_length": 547.0236206054688, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.17386666666666667, "grad_norm": 0.004544574301689863, "kl": 0.06464385986328125, "learning_rate": 1.0277777777777777e-06, "loss": -0.017, "num_tokens": 39387628.0, "reward": 0.506102442741394, "reward_std": 0.33533090353012085, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.6356328725814819, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.09219704568386078, "step": 163 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.537001287001287, "calib/avg_num_step_conf": 5.16796875, "calib/ece": 0.22402390438247025, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.05179282868525897, "calib/gap": 0.030009009009009047, "calib/mean_conf": 0.7777290836653387, "calib/mu_c": 0.791, "calib/mu_w": 0.760990990990991, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22199203187251007, "calib/std_conf": 0.11736152436447117, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6988037634408601, "calib/step_q_c_n": 744.0, "calib/step_q_gap": 0.018561967240514843, "calib/step_q_w": 0.6802417962003453, "calib/step_q_w_n": 579.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2157.0, "completions/max_terminated_length": 2157.0, "completions/mean_length": 567.21875, "completions/mean_terminated_length": 571.68505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 185.0, "epoch": 0.17493333333333333, "grad_norm": 0.004481378011405468, "kl": 0.0655364990234375, "learning_rate": 1.0000000000000002e-06, "loss": 0.0257, "num_tokens": 39638972.0, "reward": 0.6318373084068298, "reward_std": 0.324283629655838, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6921964883804321, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.2660093605518341, "step": 164 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5638165662089107, "calib/avg_num_step_conf": 5.4453125, "calib/ece": 0.3218503937007874, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.07480314960629922, "calib/gap": 0.022084757347915263, "calib/mean_conf": 0.7857086614173229, "calib/mu_c": 0.7972727272727272, "calib/mu_w": 0.775187969924812, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3155905511811023, "calib/std_conf": 0.12280870536715315, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7209561128526646, "calib/step_q_c_n": 638.0, "calib/step_q_gap": 0.048046059942611685, "calib/step_q_w": 0.6729100529100529, "calib/step_q_w_n": 756.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2947.0, "completions/max_terminated_length": 2947.0, "completions/mean_length": 565.6875, "completions/mean_terminated_length": 565.6875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.176, "grad_norm": 0.0046351272612810135, "kl": 0.0653533935546875, "learning_rate": 9.722222222222224e-07, "loss": 0.0127, "num_tokens": 39889364.0, "reward": 0.5509390830993652, "reward_std": 0.3410285711288452, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.6437867283821106, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.16590389609336853, "step": 165 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5079512324410284, "calib/avg_num_step_conf": 5.33984375, "calib/ece": 0.20920634920634926, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.06746031746031746, "calib/gap": 0.008144712430426582, "calib/mean_conf": 0.7943650793650794, "calib/mu_c": 0.7975324675324675, "calib/mu_w": 0.789387755102041, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.19623015873015878, "calib/std_conf": 0.12337512825773576, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7252061855670103, "calib/step_q_c_n": 776.0, "calib/step_q_gap": 0.007625813316587293, "calib/step_q_w": 0.717580372250423, "calib/step_q_w_n": 591.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2191.0, "completions/max_terminated_length": 2191.0, "completions/mean_length": 553.09765625, "completions/mean_terminated_length": 553.09765625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.17706666666666668, "grad_norm": 0.004448858089745045, "kl": 0.05725860595703125, "learning_rate": 9.444444444444445e-07, "loss": 0.0238, "num_tokens": 40137141.0, "reward": 0.663632333278656, "reward_std": 0.33324968814849854, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7015999555587769, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.3108208179473877, "step": 166 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5180018318889593, "calib/avg_num_step_conf": 5.32421875, "calib/ece": 0.15547244094488194, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.06692913385826772, "calib/gap": 0.005885295568237914, "calib/mean_conf": 0.7946850393700787, "calib/mu_c": 0.7966081871345029, "calib/mu_w": 0.790722891566265, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1384645669291339, "calib/std_conf": 0.11801716148976306, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7187671794871795, "calib/step_q_c_n": 975.0, "calib/step_q_gap": -0.005493129791171114, "calib/step_q_w": 0.7242603092783506, "calib/step_q_w_n": 388.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2118.0, "completions/max_terminated_length": 2118.0, "completions/mean_length": 536.91796875, "completions/mean_terminated_length": 539.0235595703125, "completions/min_length": 0.0, "completions/min_terminated_length": 203.0, "epoch": 0.17813333333333334, "grad_norm": 0.004431822337210178, "kl": 0.06308746337890625, "learning_rate": 9.166666666666666e-07, "loss": -0.0022, "num_tokens": 40380200.0, "reward": 0.6855368614196777, "reward_std": 0.31911522150039673, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.744168758392334, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.29565489292144775, "step": 167 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5900554981930819, "calib/avg_num_step_conf": 5.140625, "calib/ece": 0.17351778656126482, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.07114624505928854, "calib/gap": 0.058692565823438514, "calib/mean_conf": 0.7624505928853755, "calib/mu_c": 0.7865771812080538, "calib/mu_w": 0.7278846153846152, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17351778656126482, "calib/std_conf": 0.13785884733373494, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7120529801324503, "calib/step_q_c_n": 755.0, "calib/step_q_gap": 0.056634085301790815, "calib/step_q_w": 0.6554188948306595, "calib/step_q_w_n": 561.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2766.0, "completions/max_terminated_length": 2766.0, "completions/mean_length": 560.04296875, "completions/mean_terminated_length": 562.2392578125, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.1792, "grad_norm": 0.0045144446194171906, "kl": 0.06415557861328125, "learning_rate": 8.88888888888889e-07, "loss": 0.0411, "num_tokens": 40628243.0, "reward": 0.6357038021087646, "reward_std": 0.3899627923965454, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7285742163658142, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.228770911693573, "step": 168 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.635352366255144, "calib/avg_num_step_conf": 5.19140625, "calib/ece": 0.2372619047619047, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.05952380952380952, "calib/gap": 0.038148148148148264, "calib/mean_conf": 0.7883730158730159, "calib/mu_c": 0.8047222222222222, "calib/mu_w": 0.766574074074074, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22710317460317456, "calib/std_conf": 0.13560611929156152, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7267403314917127, "calib/step_q_c_n": 724.0, "calib/step_q_gap": 0.10915355463220855, "calib/step_q_w": 0.6175867768595041, "calib/step_q_w_n": 605.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2591.0, "completions/max_terminated_length": 2591.0, "completions/mean_length": 547.890625, "completions/mean_terminated_length": 550.0392456054688, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.18026666666666666, "grad_norm": 0.004493064247071743, "kl": 0.06183624267578125, "learning_rate": 8.611111111111112e-07, "loss": 0.0265, "num_tokens": 40872687.0, "reward": 0.6389569640159607, "reward_std": 0.3134586811065674, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6957355737686157, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.2735845744609833, "step": 169 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6196836788942052, "calib/avg_num_step_conf": 5.30859375, "calib/ece": 0.18948207171314746, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.07569721115537849, "calib/gap": 0.04203216374268981, "calib/mean_conf": 0.7943426294820717, "calib/mu_c": 0.8109210526315789, "calib/mu_w": 0.7688888888888891, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.18912350597609567, "calib/std_conf": 0.09476678301391761, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.702200956937799, "calib/step_q_c_n": 836.0, "calib/step_q_gap": 0.0036541118135160477, "calib/step_q_w": 0.698546845124283, "calib/step_q_w_n": 523.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1956.0, "completions/max_terminated_length": 1956.0, "completions/mean_length": 553.328125, "completions/mean_terminated_length": 553.328125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.18133333333333335, "grad_norm": 0.004648624919354916, "kl": 0.05800628662109375, "learning_rate": 8.333333333333333e-07, "loss": 0.0087, "num_tokens": 41118491.0, "reward": 0.6535503268241882, "reward_std": 0.3317633271217346, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7185636758804321, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.27447444200515747, "step": 170 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5978896516653954, "calib/avg_num_step_conf": 5.1640625, "calib/ece": 0.3296031746031746, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.07539682539682539, "calib/gap": 0.048039664378336955, "calib/mean_conf": 0.7732539682539683, "calib/mu_c": 0.7995614035087718, "calib/mu_w": 0.7515217391304349, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3252380952380952, "calib/std_conf": 0.14069740899160643, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7059758203799654, "calib/step_q_c_n": 579.0, "calib/step_q_gap": 0.048815658872562895, "calib/step_q_w": 0.6571601615074025, "calib/step_q_w_n": 743.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2710.0, "completions/max_terminated_length": 2710.0, "completions/mean_length": 537.76953125, "completions/mean_terminated_length": 537.76953125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.1824, "grad_norm": 0.004815216176211834, "kl": 0.0618438720703125, "learning_rate": 8.055555555555557e-07, "loss": 0.0305, "num_tokens": 41363056.0, "reward": 0.5328998565673828, "reward_std": 0.35678592324256897, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.6431062817573547, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.13675591349601746, "step": 171 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6530531324345757, "calib/avg_num_step_conf": 4.74609375, "calib/ece": 0.1715810276679842, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.05928853754940711, "calib/gap": 0.062312318265926314, "calib/mean_conf": 0.7850197628458498, "calib/mu_c": 0.8089102564102564, "calib/mu_w": 0.7465979381443301, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17, "calib/std_conf": 0.11636328729110729, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7164840764331211, "calib/step_q_c_n": 785.0, "calib/step_q_gap": 0.03211198340986532, "calib/step_q_w": 0.6843720930232557, "calib/step_q_w_n": 430.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2483.0, "completions/max_terminated_length": 2483.0, "completions/mean_length": 509.65234375, "completions/mean_terminated_length": 511.6510009765625, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.18346666666666667, "grad_norm": 0.004593435674905777, "kl": 0.06084442138671875, "learning_rate": 7.777777777777779e-07, "loss": 0.0067, "num_tokens": 41596879.0, "reward": 0.69252610206604, "reward_std": 0.3409295082092285, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7423496246337891, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.3231714069843292, "step": 172 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5530224075039083, "calib/avg_num_step_conf": 5.21875, "calib/ece": 0.1972727272727273, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.08300395256916997, "calib/gap": 0.023839239187076422, "calib/mean_conf": 0.7771936758893281, "calib/mu_c": 0.7867105263157894, "calib/mu_w": 0.762871287128713, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18683794466403167, "calib/std_conf": 0.143737622046652, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7323192019950124, "calib/step_q_c_n": 802.0, "calib/step_q_gap": 0.02555890236954428, "calib/step_q_w": 0.7067602996254682, "calib/step_q_w_n": 534.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2323.0, "completions/max_terminated_length": 2323.0, "completions/mean_length": 558.58984375, "completions/mean_terminated_length": 558.58984375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.18453333333333333, "grad_norm": 0.0045933169312775135, "kl": 0.0599365234375, "learning_rate": 7.5e-07, "loss": 0.0606, "num_tokens": 41843038.0, "reward": 0.6417874097824097, "reward_std": 0.3373648524284363, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7074738144874573, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.26047593355178833, "step": 173 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5402089101434684, "calib/avg_num_step_conf": 5.4140625, "calib/ece": 0.2594861660079052, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.08695652173913043, "calib/gap": 0.03405424112761135, "calib/mean_conf": 0.7881818181818182, "calib/mu_c": 0.8037956204379563, "calib/mu_w": 0.7697413793103449, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2530830039525692, "calib/std_conf": 0.1330512046868835, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6936849132176236, "calib/step_q_c_n": 749.0, "calib/step_q_gap": 0.014925101600669066, "calib/step_q_w": 0.6787598116169545, "calib/step_q_w_n": 637.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2928.0, "completions/max_terminated_length": 2928.0, "completions/mean_length": 590.55859375, "completions/mean_terminated_length": 590.55859375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.1856, "grad_norm": 0.005183553323149681, "kl": 0.058837890625, "learning_rate": 7.222222222222222e-07, "loss": 0.0599, "num_tokens": 42098453.0, "reward": 0.5652014017105103, "reward_std": 0.39359307289123535, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6796855926513672, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.14681102335453033, "step": 174 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5470371610311349, "calib/avg_num_step_conf": 5.203125, "calib/ece": 0.34661290322580646, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.07258064516129033, "calib/gap": 0.033510545698024585, "calib/mean_conf": 0.7541935483870967, "calib/mu_c": 0.7737864077669901, "calib/mu_w": 0.7402758620689656, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.34274193548387094, "calib/std_conf": 0.15626485361240294, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6955212355212355, "calib/step_q_c_n": 518.0, "calib/step_q_gap": 0.023113373113373137, "calib/step_q_w": 0.6724078624078623, "calib/step_q_w_n": 814.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3054.0, "completions/max_terminated_length": 3054.0, "completions/mean_length": 605.14453125, "completions/mean_terminated_length": 607.5177001953125, "completions/min_length": 0.0, "completions/min_terminated_length": 95.0, "epoch": 0.18666666666666668, "grad_norm": 0.004421654157340527, "kl": 0.05449676513671875, "learning_rate": 6.944444444444446e-07, "loss": -0.0002, "num_tokens": 42359194.0, "reward": 0.48904335498809814, "reward_std": 0.3712538182735443, "rewards/accuracy_reward_step": 0.40234375, "rewards/final_brier_reward_step": 0.6143742203712463, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.0894937515258789, "step": 175 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5567669172932331, "calib/avg_num_step_conf": 5.31640625, "calib/ece": 0.25316205533596836, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.08695652173913043, "calib/gap": 0.02729385964912301, "calib/mean_conf": 0.7712648221343873, "calib/mu_c": 0.7842105263157896, "calib/mu_w": 0.7569166666666666, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2493675889328063, "calib/std_conf": 0.130116932161979, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7219310344827584, "calib/step_q_c_n": 725.0, "calib/step_q_gap": 0.020563109954456538, "calib/step_q_w": 0.7013679245283019, "calib/step_q_w_n": 636.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2310.0, "completions/max_terminated_length": 2310.0, "completions/mean_length": 541.265625, "completions/mean_terminated_length": 541.265625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.18773333333333334, "grad_norm": 0.0042267730459570885, "kl": 0.05854034423828125, "learning_rate": 6.666666666666667e-07, "loss": -0.0064, "num_tokens": 42601822.0, "reward": 0.6104744672775269, "reward_std": 0.31826311349868774, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6766769886016846, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.2434908151626587, "step": 176 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6396663928729387, "calib/avg_num_step_conf": 5.16796875, "calib/ece": 0.26234126984126993, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.06746031746031746, "calib/gap": 0.04010172490048647, "calib/mean_conf": 0.7823412698412698, "calib/mu_c": 0.8012781954887217, "calib/mu_w": 0.7611764705882352, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.25845238095238104, "calib/std_conf": 0.12524450136511792, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7173524962178517, "calib/step_q_c_n": 661.0, "calib/step_q_gap": 0.04609871978280633, "calib/step_q_w": 0.6712537764350454, "calib/step_q_w_n": 662.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2584.0, "completions/max_terminated_length": 2584.0, "completions/mean_length": 549.38671875, "completions/mean_terminated_length": 549.38671875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.1888, "grad_norm": 0.004867099225521088, "kl": 0.05817413330078125, "learning_rate": 6.388888888888889e-07, "loss": 0.0114, "num_tokens": 42846297.0, "reward": 0.6099656224250793, "reward_std": 0.3393414318561554, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6782711148262024, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.2416602075099945, "step": 177 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6612476870208829, "calib/avg_num_step_conf": 5.16015625, "calib/ece": 0.17920948616600793, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.06719367588932806, "calib/gap": 0.06232421358710016, "calib/mean_conf": 0.7777075098814229, "calib/mu_c": 0.8016025641025641, "calib/mu_w": 0.739278350515464, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17015810276679846, "calib/std_conf": 0.13258855428056515, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7318061088977423, "calib/step_q_c_n": 753.0, "calib/step_q_gap": 0.036225122982249336, "calib/step_q_w": 0.695580985915493, "calib/step_q_w_n": 568.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2538.0, "completions/max_terminated_length": 2538.0, "completions/mean_length": 519.85546875, "completions/mean_terminated_length": 521.8941650390625, "completions/min_length": 0.0, "completions/min_terminated_length": 189.0, "epoch": 0.18986666666666666, "grad_norm": 0.004670095629990101, "kl": 0.059825897216796875, "learning_rate": 6.111111111111112e-07, "loss": 0.0078, "num_tokens": 43085452.0, "reward": 0.6979852914810181, "reward_std": 0.3313940167427063, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7407445311546326, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.33569473028182983, "step": 178 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6188888888888888, "calib/avg_num_step_conf": 5.171875, "calib/ece": 0.30617529880478084, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.04780876494023904, "calib/gap": 0.0348679365079364, "calib/mean_conf": 0.7941434262948208, "calib/mu_c": 0.8115079365079364, "calib/mu_w": 0.77664, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.29916334661354577, "calib/std_conf": 0.10948067989163843, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7024519940915805, "calib/step_q_c_n": 677.0, "calib/step_q_gap": 0.008402535049849336, "calib/step_q_w": 0.6940494590417312, "calib/step_q_w_n": 647.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2673.0, "completions/max_terminated_length": 2673.0, "completions/mean_length": 535.08203125, "completions/mean_terminated_length": 537.180419921875, "completions/min_length": 0.0, "completions/min_terminated_length": 117.0, "epoch": 0.19093333333333334, "grad_norm": 0.004549046047031879, "kl": 0.06024932861328125, "learning_rate": 5.833333333333334e-07, "loss": 0.0058, "num_tokens": 43328697.0, "reward": 0.5557443499565125, "reward_std": 0.3748915195465088, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.6570112705230713, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.15994608402252197, "step": 179 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6261399210562134, "calib/avg_num_step_conf": 5.0546875, "calib/ece": 0.152390438247012, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.055776892430278883, "calib/gap": 0.049441949094868654, "calib/mean_conf": 0.779402390438247, "calib/mu_c": 0.7977215189873418, "calib/mu_w": 0.7482795698924731, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15115537848605584, "calib/std_conf": 0.11593394118801192, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7070868014268727, "calib/step_q_c_n": 841.0, "calib/step_q_gap": 0.015033821294422434, "calib/step_q_w": 0.6920529801324503, "calib/step_q_w_n": 453.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1670.0, "completions/max_terminated_length": 1670.0, "completions/mean_length": 563.22265625, "completions/mean_terminated_length": 572.1627197265625, "completions/min_length": 0.0, "completions/min_terminated_length": 192.0, "epoch": 0.192, "grad_norm": 0.004343854729086161, "kl": 0.0619049072265625, "learning_rate": 5.555555555555555e-07, "loss": -0.0097, "num_tokens": 43576738.0, "reward": 0.6902997493743896, "reward_std": 0.3136835992336273, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7354074716567993, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.32644209265708923, "step": 180 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6090808416389811, "calib/avg_num_step_conf": 4.84375, "calib/ece": 0.28729411764705876, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.054901960784313725, "calib/gap": 0.033252122554448205, "calib/mean_conf": 0.7869803921568628, "calib/mu_c": 0.8034108527131784, "calib/mu_w": 0.7701587301587302, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2841960784313725, "calib/std_conf": 0.11060572800046516, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7113057324840765, "calib/step_q_c_n": 628.0, "calib/step_q_gap": 0.02302141875858621, "calib/step_q_w": 0.6882843137254903, "calib/step_q_w_n": 612.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1545.0, "completions/max_terminated_length": 1545.0, "completions/mean_length": 484.0, "completions/mean_terminated_length": 484.0, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.19306666666666666, "grad_norm": 0.005096070468425751, "kl": 0.0666961669921875, "learning_rate": 5.277777777777779e-07, "loss": -0.0033, "num_tokens": 43806906.0, "reward": 0.578808069229126, "reward_std": 0.3632180094718933, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6692156195640564, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.18918174505233765, "step": 181 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5387970021966662, "calib/avg_num_step_conf": 5.39453125, "calib/ece": 0.22143426294820712, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.043824701195219126, "calib/gap": 0.024952836283757773, "calib/mean_conf": 0.7831075697211155, "calib/mu_c": 0.7939436619718311, "calib/mu_w": 0.7689908256880733, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21940239043824694, "calib/std_conf": 0.12184138496467316, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7168958868894602, "calib/step_q_c_n": 778.0, "calib/step_q_gap": 0.009897545264252905, "calib/step_q_w": 0.7069983416252072, "calib/step_q_w_n": 603.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2210.0, "completions/max_terminated_length": 2210.0, "completions/mean_length": 522.02734375, "completions/mean_terminated_length": 526.1378173828125, "completions/min_length": 0.0, "completions/min_terminated_length": 233.0, "epoch": 0.19413333333333332, "grad_norm": 0.0045974161475896835, "kl": 0.05859375, "learning_rate": 5.000000000000001e-07, "loss": 0.0018, "num_tokens": 44046705.0, "reward": 0.5793590545654297, "reward_std": 0.3560647964477539, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6907273530960083, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.160959392786026, "step": 182 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5923833703586163, "calib/avg_num_step_conf": 4.80859375, "calib/ece": 0.32178571428571423, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.0873015873015873, "calib/gap": 0.04948206918438591, "calib/mean_conf": 0.7780555555555555, "calib/mu_c": 0.8049565217391305, "calib/mu_w": 0.7554744525547445, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32174603174603167, "calib/std_conf": 0.14135887042175166, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7299630996309963, "calib/step_q_c_n": 542.0, "calib/step_q_gap": 0.037394159137527416, "calib/step_q_w": 0.6925689404934688, "calib/step_q_w_n": 689.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2259.0, "completions/max_terminated_length": 2259.0, "completions/mean_length": 548.87890625, "completions/mean_terminated_length": 548.87890625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.1952, "grad_norm": 0.004615063313394785, "kl": 0.05975341796875, "learning_rate": 4.7222222222222226e-07, "loss": 0.0156, "num_tokens": 44293898.0, "reward": 0.5459011793136597, "reward_std": 0.37487709522247314, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.642777681350708, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.16230589151382446, "step": 183 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5727848101265822, "calib/avg_num_step_conf": 5.0390625, "calib/ece": 0.17000000000000007, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.08300395256916997, "calib/gap": 0.0353637574950032, "calib/mean_conf": 0.7945059288537549, "calib/mu_c": 0.8077848101265822, "calib/mu_w": 0.772421052631579, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17000000000000007, "calib/std_conf": 0.10058845125616318, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7023876765083441, "calib/step_q_c_n": 779.0, "calib/step_q_gap": 0.004775152046504583, "calib/step_q_w": 0.6976125244618395, "calib/step_q_w_n": 511.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2373.0, "completions/max_terminated_length": 2373.0, "completions/mean_length": 539.79296875, "completions/mean_terminated_length": 541.9098510742188, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.19626666666666667, "grad_norm": 0.0043892087414860725, "kl": 0.05843353271484375, "learning_rate": 4.444444444444445e-07, "loss": 0.0487, "num_tokens": 44537365.0, "reward": 0.6674075722694397, "reward_std": 0.39005720615386963, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7324241995811462, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.2820783853530884, "step": 184 }, { "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.541623631481099, "calib/avg_num_step_conf": 5.24609375, "calib/ece": 0.22389344262295074, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.09836065573770492, "calib/gap": 0.02085863802244703, "calib/mean_conf": 0.8017622950819672, "calib/mu_c": 0.8105673758865248, "calib/mu_w": 0.7897087378640778, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.22389344262295074, "calib/std_conf": 0.10399287123098268, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7203409090909091, "calib/step_q_c_n": 704.0, "calib/step_q_gap": 0.060700846493100036, "calib/step_q_w": 0.659640062597809, "calib/step_q_w_n": 639.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2798.0, "completions/max_terminated_length": 2798.0, "completions/mean_length": 593.890625, "completions/mean_terminated_length": 596.2196655273438, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.19733333333333333, "grad_norm": 0.004370216280221939, "kl": 0.04863739013671875, "learning_rate": 4.1666666666666667e-07, "loss": -0.0136, "num_tokens": 44796321.0, "reward": 0.581261396408081, "reward_std": 0.3171704411506653, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6714941263198853, "rewards/format_reward_step": 0.94921875, "rewards/step_correlation_reward": 0.19102858006954193, "step": 185 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5160457516339869, "calib/avg_num_step_conf": 5.12890625, "calib/ece": 0.2179446640316206, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.05533596837944664, "calib/gap": 0.0024692810457518366, "calib/mean_conf": 0.778893280632411, "calib/mu_c": 0.7798692810457516, "calib/mu_w": 0.7773999999999998, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1960474308300396, "calib/std_conf": 0.15070703090545, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7165880893300248, "calib/step_q_c_n": 806.0, "calib/step_q_gap": 0.01714035757460075, "calib/step_q_w": 0.6994477317554241, "calib/step_q_w_n": 507.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2380.0, "completions/max_terminated_length": 2380.0, "completions/mean_length": 549.40234375, "completions/mean_terminated_length": 549.40234375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.1984, "grad_norm": 0.004544900264590979, "kl": 0.0554046630859375, "learning_rate": 3.8888888888888895e-07, "loss": 0.0202, "num_tokens": 45042008.0, "reward": 0.6229519844055176, "reward_std": 0.3632119297981262, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6970667839050293, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.23243087530136108, "step": 186 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5849218452154021, "calib/avg_num_step_conf": 5.4375, "calib/ece": 0.2889641434262948, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.05976095617529881, "calib/gap": 0.025349472614054958, "calib/mean_conf": 0.7901593625498008, "calib/mu_c": 0.8024806201550387, "calib/mu_w": 0.7771311475409838, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.28258964143426296, "calib/std_conf": 0.11937742927167588, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7098208955223881, "calib/step_q_c_n": 670.0, "calib/step_q_gap": 0.0552087071567372, "calib/step_q_w": 0.6546121883656509, "calib/step_q_w_n": 722.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2619.0, "completions/max_terminated_length": 2619.0, "completions/mean_length": 599.6796875, "completions/mean_terminated_length": 599.6796875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.19946666666666665, "grad_norm": 0.025320569053292274, "kl": 0.0690155029296875, "learning_rate": 3.611111111111111e-07, "loss": -0.0079, "num_tokens": 45297070.0, "reward": 0.5355815887451172, "reward_std": 0.3840063214302063, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6569265127182007, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.11814286559820175, "step": 187 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.577579905641571, "calib/avg_num_step_conf": 5.28515625, "calib/ece": 0.19904000000000002, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.068, "calib/gap": 0.03922652667951365, "calib/mean_conf": 0.78328, "calib/mu_c": 0.7991275167785235, "calib/mu_w": 0.7599009900990098, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.19316000000000003, "calib/std_conf": 0.13080535768843718, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7264237516869095, "calib/step_q_c_n": 741.0, "calib/step_q_gap": 0.029773424889523836, "calib/step_q_w": 0.6966503267973857, "calib/step_q_w_n": 612.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2802.0, "completions/max_terminated_length": 2802.0, "completions/mean_length": 579.109375, "completions/mean_terminated_length": 583.6693115234375, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.20053333333333334, "grad_norm": 0.004090098664164543, "kl": 0.050445556640625, "learning_rate": 3.3333333333333335e-07, "loss": -0.0056, "num_tokens": 45549394.0, "reward": 0.640600860118866, "reward_std": 0.3480183482170105, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7073789238929749, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.26288533210754395, "step": 188 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5781945130993573, "calib/avg_num_step_conf": 5.00390625, "calib/ece": 0.24784313725490187, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.058823529411764705, "calib/gap": 0.039180672268907646, "calib/mean_conf": 0.772156862745098, "calib/mu_c": 0.7904411764705882, "calib/mu_w": 0.7512605042016806, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.24333333333333323, "calib/std_conf": 0.14273125336868883, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7104017216642755, "calib/step_q_c_n": 697.0, "calib/step_q_gap": 0.03707980385605625, "calib/step_q_w": 0.6733219178082193, "calib/step_q_w_n": 584.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2940.0, "completions/max_terminated_length": 2940.0, "completions/mean_length": 546.1328125, "completions/mean_terminated_length": 546.1328125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.2016, "grad_norm": 0.004661476239562035, "kl": 0.06034088134765625, "learning_rate": 3.055555555555556e-07, "loss": 0.0078, "num_tokens": 45796972.0, "reward": 0.5992229580879211, "reward_std": 0.3520023822784424, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6865929961204529, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.20716550946235657, "step": 189 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6374180378572313, "calib/avg_num_step_conf": 5.203125, "calib/ece": 0.24294117647058813, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0784313725490196, "calib/gap": 0.0629871334900407, "calib/mean_conf": 0.7801960784313725, "calib/mu_c": 0.8093430656934306, "calib/mu_w": 0.7463559322033899, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24294117647058813, "calib/std_conf": 0.1258896216825136, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7085185185185185, "calib/step_q_c_n": 729.0, "calib/step_q_gap": 0.01792150359314537, "calib/step_q_w": 0.6905970149253732, "calib/step_q_w_n": 603.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1710.0, "completions/max_terminated_length": 1710.0, "completions/mean_length": 593.7578125, "completions/mean_terminated_length": 596.0863037109375, "completions/min_length": 0.0, "completions/min_terminated_length": 216.0, "epoch": 0.20266666666666666, "grad_norm": 0.0045477100647985935, "kl": 0.05255889892578125, "learning_rate": 2.7777777777777776e-07, "loss": -0.0021, "num_tokens": 46054582.0, "reward": 0.6030627489089966, "reward_std": 0.3603215515613556, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.7050730586051941, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.19480247795581818, "step": 190 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6133603238866397, "calib/avg_num_step_conf": 5.4375, "calib/ece": 0.33456, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.096, "calib/gap": 0.04238673607094667, "calib/mean_conf": 0.80104, "calib/mu_c": 0.8235897435897437, "calib/mu_w": 0.781203007518797, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.33380000000000004, "calib/std_conf": 0.11470186746518123, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7104006410256409, "calib/step_q_c_n": 624.0, "calib/step_q_gap": 0.0047495993589742325, "calib/step_q_w": 0.7056510416666667, "calib/step_q_w_n": 768.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2482.0, "completions/max_terminated_length": 2482.0, "completions/mean_length": 540.63671875, "completions/mean_terminated_length": 540.63671875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.20373333333333332, "grad_norm": 0.004615450277924538, "kl": 0.059356689453125, "learning_rate": 2.5000000000000004e-07, "loss": 0.0366, "num_tokens": 46297153.0, "reward": 0.524100661277771, "reward_std": 0.2861478328704834, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.6304647922515869, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.13101768493652344, "step": 191 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6018273471959673, "calib/avg_num_step_conf": 5.09375, "calib/ece": 0.2530434782608696, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.06324110671936758, "calib/gap": 0.03463768115942012, "calib/mean_conf": 0.7780237154150197, "calib/mu_c": 0.7937681159420289, "calib/mu_w": 0.7591304347826088, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.24280632411067196, "calib/std_conf": 0.12691866396896104, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7158591549295774, "calib/step_q_c_n": 710.0, "calib/step_q_gap": 0.06244164651206896, "calib/step_q_w": 0.6534175084175085, "calib/step_q_w_n": 594.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2399.0, "completions/max_terminated_length": 2399.0, "completions/mean_length": 559.578125, "completions/mean_terminated_length": 559.578125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.2048, "grad_norm": 0.004527461715042591, "kl": 0.06067657470703125, "learning_rate": 2.2222222222222224e-07, "loss": 0.0512, "num_tokens": 46545381.0, "reward": 0.6329571604728699, "reward_std": 0.3509492874145508, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6870129108428955, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.27421388030052185, "step": 192 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.62677842385253, "calib/avg_num_step_conf": 5.0078125, "calib/ece": 0.26494117647058824, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.06666666666666667, "calib/gap": 0.04801002103179497, "calib/mean_conf": 0.7943529411764706, "calib/mu_c": 0.8165693430656935, "calib/mu_w": 0.7685593220338985, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26101960784313727, "calib/std_conf": 0.1261817586533001, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7286369958275383, "calib/step_q_c_n": 719.0, "calib/step_q_gap": 0.030484242719190036, "calib/step_q_w": 0.6981527531083482, "calib/step_q_w_n": 563.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3026.0, "completions/max_terminated_length": 3026.0, "completions/mean_length": 548.2421875, "completions/mean_terminated_length": 548.2421875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.20586666666666667, "grad_norm": 0.004557074513286352, "kl": 0.05731964111328125, "learning_rate": 1.9444444444444447e-07, "loss": 0.0423, "num_tokens": 46791443.0, "reward": 0.5993220806121826, "reward_std": 0.38422849774360657, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6905304789543152, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.20186357200145721, "step": 193 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5816294132588266, "calib/avg_num_step_conf": 4.99609375, "calib/ece": 0.29075697211155377, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.07171314741035857, "calib/gap": 0.03610744221488449, "calib/mean_conf": 0.7697211155378486, "calib/mu_c": 0.7875590551181102, "calib/mu_w": 0.7514516129032257, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2772509960159362, "calib/std_conf": 0.1634693640424067, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.708179012345679, "calib/step_q_c_n": 648.0, "calib/step_q_gap": 0.0453580931697678, "calib/step_q_w": 0.6628209191759112, "calib/step_q_w_n": 631.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2415.0, "completions/max_terminated_length": 2415.0, "completions/mean_length": 519.83984375, "completions/mean_terminated_length": 523.9330444335938, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.20693333333333333, "grad_norm": 0.004988906439393759, "kl": 0.055816650390625, "learning_rate": 1.6666666666666668e-07, "loss": 0.0221, "num_tokens": 47030466.0, "reward": 0.554186999797821, "reward_std": 0.4383990168571472, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6530015468597412, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.16162237524986267, "step": 194 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5845663265306122, "calib/avg_num_step_conf": 5.17578125, "calib/ece": 0.22968253968253974, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.07936507936507936, "calib/gap": 0.036374999999999824, "calib/mean_conf": 0.779047619047619, "calib/mu_c": 0.7952142857142857, "calib/mu_w": 0.7588392857142858, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22658730158730164, "calib/std_conf": 0.12304295656801217, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7152106741573033, "calib/step_q_c_n": 712.0, "calib/step_q_gap": 0.051426008578184135, "calib/step_q_w": 0.6637846655791192, "calib/step_q_w_n": 613.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2504.0, "completions/max_terminated_length": 2504.0, "completions/mean_length": 551.60546875, "completions/mean_terminated_length": 551.60546875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.208, "grad_norm": 0.004569363314658403, "kl": 0.0572357177734375, "learning_rate": 1.3888888888888888e-07, "loss": 0.0088, "num_tokens": 47277661.0, "reward": 0.5922529697418213, "reward_std": 0.3348928391933441, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6949304938316345, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.1833253800868988, "step": 195 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5556579783852511, "calib/avg_num_step_conf": 4.85546875, "calib/ece": 0.23015810276679843, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.07905138339920949, "calib/gap": 0.03444055944055946, "calib/mean_conf": 0.7953754940711463, "calib/mu_c": 0.8103496503496503, "calib/mu_w": 0.7759090909090909, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.23015810276679843, "calib/std_conf": 0.11368928824264636, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7309170305676855, "calib/step_q_c_n": 687.0, "calib/step_q_gap": 0.039449404668404964, "calib/step_q_w": 0.6914676258992806, "calib/step_q_w_n": 556.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2226.0, "completions/max_terminated_length": 2226.0, "completions/mean_length": 487.3359375, "completions/mean_terminated_length": 487.3359375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.20906666666666668, "grad_norm": 0.005070297047495842, "kl": 0.062015533447265625, "learning_rate": 1.1111111111111112e-07, "loss": 0.0075, "num_tokens": 47504963.0, "reward": 0.6088164448738098, "reward_std": 0.3506855368614197, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6970175504684448, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.21124020218849182, "step": 196 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6139024704065877, "calib/avg_num_step_conf": 5.296875, "calib/ece": 0.2788400000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.1, "calib/gap": 0.03703937210499231, "calib/mean_conf": 0.8035599999999999, "calib/mu_c": 0.8207462686567165, "calib/mu_w": 0.7837068965517242, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.27320000000000005, "calib/std_conf": 0.11233755560808682, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7165542521994136, "calib/step_q_c_n": 682.0, "calib/step_q_gap": 0.0035127091727072957, "calib/step_q_w": 0.7130415430267063, "calib/step_q_w_n": 674.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2342.0, "completions/max_terminated_length": 2342.0, "completions/mean_length": 540.3359375, "completions/mean_terminated_length": 546.7431030273438, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.21013333333333334, "grad_norm": 0.004491974599659443, "kl": 0.057811737060546875, "learning_rate": 8.333333333333334e-08, "loss": 0.0028, "num_tokens": 47748345.0, "reward": 0.5784967541694641, "reward_std": 0.3455749452114105, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6694449186325073, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.18754860758781433, "step": 197 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6222222222222222, "calib/avg_num_step_conf": 4.88671875, "calib/ece": 0.2284523809523809, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.06746031746031746, "calib/gap": 0.019258823529411884, "calib/mean_conf": 0.794404761904762, "calib/mu_c": 0.8022, "calib/mu_w": 0.7829411764705881, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21380952380952375, "calib/std_conf": 0.12721701734097443, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7308748317631224, "calib/step_q_c_n": 743.0, "calib/step_q_gap": 0.025776406566271892, "calib/step_q_w": 0.7050984251968505, "calib/step_q_w_n": 508.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2599.0, "completions/max_terminated_length": 2599.0, "completions/mean_length": 496.71484375, "completions/mean_terminated_length": 498.66278076171875, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.2112, "grad_norm": 0.004678445868194103, "kl": 0.06182861328125, "learning_rate": 5.555555555555556e-08, "loss": 0.0301, "num_tokens": 47980888.0, "reward": 0.6651490926742554, "reward_std": 0.322966992855072, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7013660073280334, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.3148697316646576, "step": 198 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.59375, "calib/avg_num_step_conf": 4.97265625, "calib/ece": 0.22723320158102772, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.04743083003952569, "calib/gap": 0.02434378185524977, "calib/mean_conf": 0.7779841897233201, "calib/mu_c": 0.7884722222222221, "calib/mu_w": 0.7641284403669724, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2180237154150198, "calib/std_conf": 0.13423876929138315, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7115448275862067, "calib/step_q_c_n": 725.0, "calib/step_q_gap": 0.022639718097155703, "calib/step_q_w": 0.688905109489051, "calib/step_q_w_n": 548.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2676.0, "completions/max_terminated_length": 2676.0, "completions/mean_length": 556.4921875, "completions/mean_terminated_length": 556.4921875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.21226666666666666, "grad_norm": 0.004453481640666723, "kl": 0.059539794921875, "learning_rate": 2.777777777777778e-08, "loss": 0.0177, "num_tokens": 48227550.0, "reward": 0.62669438123703, "reward_std": 0.36970627307891846, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6939073801040649, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.2501063346862793, "step": 199 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5132642068408295, "calib/avg_num_step_conf": 5.01171875, "calib/ece": 0.16654761904761894, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.051587301587301584, "calib/gap": 0.01234715863183411, "calib/mean_conf": 0.7821031746031746, "calib/mu_c": 0.7867088607594938, "calib/mu_w": 0.7743617021276596, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16083333333333322, "calib/std_conf": 0.1195292571852399, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7197509829619922, "calib/step_q_c_n": 763.0, "calib/step_q_gap": 0.041674059885069115, "calib/step_q_w": 0.6780769230769231, "calib/step_q_w_n": 520.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2989.0, "completions/max_terminated_length": 2989.0, "completions/mean_length": 550.1484375, "completions/mean_terminated_length": 552.305908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.21333333333333335, "grad_norm": 0.004446312319487333, "kl": 0.05374908447265625, "learning_rate": 0.0, "loss": 0.0488, "num_tokens": 48476436.0, "reward": 0.671890377998352, "reward_std": 0.292317271232605, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.722089409828186, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.3013787865638733, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": 0.01753797926590778, "train_runtime": 12763.8849, "train_samples_per_second": 4.011, "train_steps_per_second": 0.016 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 48476436, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }