{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.773959219455719, "adv/mean_abs_reasoning": 0.47714588046073914, "adv/mean_abs_step_conf": 0.7603062391281128, "adv/ratio_final_to_reasoning": 1.622059942565935, "adv/ratio_step_to_reasoning": 1.5934460932450047, "adv/std_final_conf": 0.9294352531433105, "adv/std_reasoning": 0.7393431663513184, "adv/std_step_conf": 0.9350208044052124, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.38076182006817844, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.2003187250996017, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2948207171314741, "calib/gap": -0.026059730250481805, "calib/mean_conf": 0.8737051792828686, "calib/mu_c": 0.865606936416185, "calib/mu_w": 0.8916666666666668, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19239043824701207, "calib/std_conf": 0.09027744273295583, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7959393232205367, "calib/step_q_c_n": 857.0, "calib/step_q_gap": -0.006446568895645877, "calib/step_q_w": 0.8023858921161826, "calib/step_q_w_n": 482.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 474.94921875, "completions/mean_terminated_length": 478.68896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.0010666666666666667, "grad_norm": 0.04540952667593956, "kl": 0.000291675329208374, "learning_rate": 2.5000000000000004e-07, "loss": 0.0801, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03466901555657387, "mask/share_reasoning": 0.8340686559677124, "mask/share_step_conf": 0.12344987690448761, "num_tokens": 229171.0, "reward": 0.3269073963165283, "reward_std": 0.18100249767303467, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7142800688743591, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": -0.3909340500831604, "step": 1 }, { "adv/mean_abs_final_conf": 0.7672724723815918, "adv/mean_abs_reasoning": 0.5104547739028931, "adv/mean_abs_step_conf": 0.7713351249694824, "adv/ratio_final_to_reasoning": 1.503115479781084, "adv/ratio_step_to_reasoning": 1.5110743681990095, "adv/std_final_conf": 0.9330522418022156, "adv/std_reasoning": 0.7575037479400635, "adv/std_step_conf": 0.9351165890693665, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.44343065693430656, "calib/avg_num_step_conf": 5.05859375, "calib/ece": 0.3349411764705883, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2823529411764706, "calib/gap": 0.002352468143016151, "calib/mean_conf": 0.8721960784313726, "calib/mu_c": 0.8732846715328467, "calib/mu_w": 0.8709322033898306, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3349411764705883, "calib/std_conf": 0.07627016470309335, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7954391371340525, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.011011892552009073, "calib/step_q_w": 0.7844272445820434, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1966.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 492.9765625, "completions/mean_terminated_length": 494.9098205566406, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.0021333333333333334, "grad_norm": 0.03894897922873497, "kl": 0.00037539005279541016, "learning_rate": 5.000000000000001e-07, "loss": -0.0068, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03364308178424835, "mask/share_reasoning": 0.8523939251899719, "mask/share_step_conf": 0.11005672812461853, "num_tokens": 458661.0, "reward": 0.25607970356941223, "reward_std": 0.1939290463924408, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6320762038230896, "rewards/format_reward_step": 0.99609375, "rewards/step_l1_reward": -0.4261667728424072, "step": 2 }, { "adv/mean_abs_final_conf": 0.7773253917694092, "adv/mean_abs_reasoning": 0.4774738848209381, "adv/mean_abs_step_conf": 0.7686185836791992, "adv/ratio_final_to_reasoning": 1.6279956171025378, "adv/ratio_step_to_reasoning": 1.609760466726774, "adv/std_final_conf": 0.9283453822135925, "adv/std_reasoning": 0.7393098473548889, "adv/std_step_conf": 0.93489009141922, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.583791928721174, "calib/avg_num_step_conf": 5.0078125, "calib/ece": 0.2604313725490196, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.3215686274509804, "calib/gap": 0.014355345911949535, "calib/mean_conf": 0.8814509803921569, "calib/mu_c": 0.8868553459119496, "calib/mu_w": 0.8725, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2591764705882353, "calib/std_conf": 0.046541701193867246, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7877590673575129, "calib/step_q_c_n": 772.0, "calib/step_q_gap": 0.018406126181042315, "calib/step_q_w": 0.7693529411764706, "calib/step_q_w_n": 510.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2468.0, "completions/max_terminated_length": 2468.0, "completions/mean_length": 505.6015625, "completions/mean_terminated_length": 505.6015625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.0032, "grad_norm": 0.060042351484298706, "kl": 0.0027255117893218994, "learning_rate": 7.5e-07, "loss": 0.0702, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03299238905310631, "mask/share_reasoning": 0.8572773933410645, "mask/share_step_conf": 0.10973025858402252, "num_tokens": 693351.0, "reward": 0.3012976050376892, "reward_std": 0.18565943837165833, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.6995797157287598, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": -0.41885948181152344, "step": 3 }, { "adv/mean_abs_final_conf": 0.762697160243988, "adv/mean_abs_reasoning": 0.3705032169818878, "adv/mean_abs_step_conf": 0.7731007933616638, "adv/ratio_final_to_reasoning": 2.0585439620657136, "adv/ratio_step_to_reasoning": 2.0866236996788534, "adv/std_final_conf": 0.9276609420776367, "adv/std_reasoning": 0.6612005233764648, "adv/std_step_conf": 0.9348263740539551, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5453267973856208, "calib/avg_num_step_conf": 5.1640625, "calib/ece": 0.27134387351778666, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.25296442687747034, "calib/gap": 0.008577124183006268, "calib/mean_conf": 0.8760869565217392, "calib/mu_c": 0.8794771241830065, "calib/mu_w": 0.8709000000000002, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27134387351778666, "calib/std_conf": 0.04777359677934313, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7998416886543536, "calib/step_q_c_n": 758.0, "calib/step_q_gap": 0.032412610640169204, "calib/step_q_w": 0.7674290780141844, "calib/step_q_w_n": 564.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2156.0, "completions/max_terminated_length": 2156.0, "completions/mean_length": 505.703125, "completions/mean_terminated_length": 507.6863098144531, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.004266666666666667, "grad_norm": 0.04558708891272545, "kl": 0.0002855062484741211, "learning_rate": 1.0000000000000002e-06, "loss": 0.0077, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03333834931254387, "mask/share_reasoning": 0.847872793674469, "mask/share_step_conf": 0.11488261073827744, "num_tokens": 928979.0, "reward": 0.28353989124298096, "reward_std": 0.15577252209186554, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6810855865478516, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": -0.4311932921409607, "step": 4 }, { "adv/mean_abs_final_conf": 0.7652193307876587, "adv/mean_abs_reasoning": 0.4220971465110779, "adv/mean_abs_step_conf": 0.7900995016098022, "adv/ratio_final_to_reasoning": 1.8128986114043195, "adv/ratio_step_to_reasoning": 1.8718427929222359, "adv/std_final_conf": 0.9303828477859497, "adv/std_reasoning": 0.7013581991195679, "adv/std_step_conf": 0.9348316192626953, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.40712190463196674, "calib/avg_num_step_conf": 4.57421875, "calib/ece": 0.3155327868852459, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.2827868852459016, "calib/gap": -0.014150351320008192, "calib/mean_conf": 0.8770081967213115, "calib/mu_c": 0.8708029197080291, "calib/mu_w": 0.8849532710280373, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.3155327868852459, "calib/std_conf": 0.04733539137977054, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.7995927601809957, "calib/step_q_c_n": 663.0, "calib/step_q_gap": 0.013096697188869522, "calib/step_q_w": 0.7864960629921262, "calib/step_q_w_n": 508.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2997.0, "completions/max_terminated_length": 2997.0, "completions/mean_length": 518.48046875, "completions/mean_terminated_length": 524.6284790039062, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.005333333333333333, "grad_norm": 0.04099205508828163, "kl": 0.0003192722797393799, "learning_rate": 1.25e-06, "loss": 0.0557, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03335411101579666, "mask/share_reasoning": 0.8477334976196289, "mask/share_step_conf": 0.10719365626573563, "num_tokens": 1168398.0, "reward": 0.24281755089759827, "reward_std": 0.16069456934928894, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6147746443748474, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": -0.42679572105407715, "step": 5 }, { "adv/mean_abs_final_conf": 0.781194806098938, "adv/mean_abs_reasoning": 0.3777908384799957, "adv/mean_abs_step_conf": 0.7725051641464233, "adv/ratio_final_to_reasoning": 2.0677971155732586, "adv/ratio_step_to_reasoning": 2.04479591737725, "adv/std_final_conf": 0.9307109117507935, "adv/std_reasoning": 0.6403254866600037, "adv/std_step_conf": 0.9347091317176819, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5274223602484474, "calib/avg_num_step_conf": 5.0234375, "calib/ece": 0.3353725490196079, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.2784313725490196, "calib/gap": 0.003583850931676902, "calib/mean_conf": 0.8810980392156863, "calib/mu_c": 0.8827142857142857, "calib/mu_w": 0.8791304347826088, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3337254901960785, "calib/std_conf": 0.04119263921584887, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7980880121396056, "calib/step_q_c_n": 659.0, "calib/step_q_gap": 0.007609543240084005, "calib/step_q_w": 0.7904784688995216, "calib/step_q_w_n": 627.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1241.0, "completions/max_terminated_length": 1241.0, "completions/mean_length": 427.6875, "completions/mean_terminated_length": 429.36474609375, "completions/min_length": 0.0, "completions/min_terminated_length": 176.0, "epoch": 0.0064, "grad_norm": 1.3526962995529175, "kl": 0.9144148826599121, "learning_rate": 1.5e-06, "loss": 0.0894, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.037748031318187714, "mask/share_reasoning": 0.8306044340133667, "mask/share_step_conf": 0.12774130702018738, "num_tokens": 1383838.0, "reward": 0.24301785230636597, "reward_std": 0.16260243952274323, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6358148455619812, "rewards/format_reward_step": 0.9921875, "rewards/step_l1_reward": -0.4575916826725006, "step": 6 }, { "adv/mean_abs_final_conf": 0.7746590375900269, "adv/mean_abs_reasoning": 0.45679908990859985, "adv/mean_abs_step_conf": 0.7530906200408936, "adv/ratio_final_to_reasoning": 1.6958419022792428, "adv/ratio_step_to_reasoning": 1.6486254825761106, "adv/std_final_conf": 0.9307973980903625, "adv/std_reasoning": 0.7205731272697449, "adv/std_step_conf": 0.9347212910652161, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4665942769545222, "calib/avg_num_step_conf": 5.546875, "calib/ece": 0.2958823529411765, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.3058823529411765, "calib/gap": -0.009091722023505722, "calib/mean_conf": 0.8838039215686274, "calib/mu_c": 0.8801315789473683, "calib/mu_w": 0.889223300970874, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.29180392156862744, "calib/std_conf": 0.044914306034737804, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7973768472906403, "calib/step_q_c_n": 812.0, "calib/step_q_gap": 0.003001847290640236, "calib/step_q_w": 0.794375, "calib/step_q_w_n": 608.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2503.0, "completions/max_terminated_length": 2503.0, "completions/mean_length": 550.87890625, "completions/mean_terminated_length": 553.0392456054688, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.007466666666666667, "grad_norm": 0.03561462089419365, "kl": 0.0003936886787414551, "learning_rate": 1.75e-06, "loss": 0.1121, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.030144359916448593, "mask/share_reasoning": 0.8576929569244385, "mask/share_step_conf": 0.10825636237859726, "num_tokens": 1632287.0, "reward": 0.2793276011943817, "reward_std": 0.17717917263507843, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6590094566345215, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": -0.41597920656204224, "step": 7 }, { "adv/mean_abs_final_conf": 0.752838671207428, "adv/mean_abs_reasoning": 0.4558185935020447, "adv/mean_abs_step_conf": 0.7695530652999878, "adv/ratio_final_to_reasoning": 1.6516190474446957, "adv/ratio_step_to_reasoning": 1.6882880081471179, "adv/std_final_conf": 0.9314465522766113, "adv/std_reasoning": 0.7391924858093262, "adv/std_step_conf": 0.9353874325752258, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.44841897233201583, "calib/avg_num_step_conf": 4.8828125, "calib/ece": 0.3188306451612902, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.2661290322580645, "calib/gap": -0.0006363636363636571, "calib/mean_conf": 0.8752822580645161, "calib/mu_c": 0.875, "calib/mu_w": 0.8756363636363637, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3188306451612902, "calib/std_conf": 0.062132151108365445, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8010367892976589, "calib/step_q_c_n": 598.0, "calib/step_q_gap": 0.04812267886821109, "calib/step_q_w": 0.7529141104294478, "calib/step_q_w_n": 652.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2669.0, "completions/max_terminated_length": 2669.0, "completions/mean_length": 549.87890625, "completions/mean_terminated_length": 552.0353393554688, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.008533333333333334, "grad_norm": 0.041965585201978683, "kl": 0.00045447051525115967, "learning_rate": 2.0000000000000003e-06, "loss": 0.068, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03198568522930145, "mask/share_reasoning": 0.8615812063217163, "mask/share_step_conf": 0.10252687335014343, "num_tokens": 1879568.0, "reward": 0.2684253752231598, "reward_std": 0.17151033878326416, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.627129316329956, "rewards/format_reward_step": 0.96875, "rewards/step_l1_reward": -0.3918410539627075, "step": 8 }, { "adv/mean_abs_final_conf": 0.8019947409629822, "adv/mean_abs_reasoning": 0.4327741265296936, "adv/mean_abs_step_conf": 0.7716740369796753, "adv/ratio_final_to_reasoning": 1.8531485405423274, "adv/ratio_step_to_reasoning": 1.7830872727247686, "adv/std_final_conf": 0.9300606846809387, "adv/std_reasoning": 0.6817935109138489, "adv/std_step_conf": 0.935168981552124, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4441333333333333, "calib/avg_num_step_conf": 4.87890625, "calib/ece": 0.2805599999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.24, "calib/gap": -0.004000000000000226, "calib/mean_conf": 0.8767999999999999, "calib/mu_c": 0.8752, "calib/mu_w": 0.8792000000000002, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.27867999999999993, "calib/std_conf": 0.04814311996536993, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7803221083455345, "calib/step_q_c_n": 683.0, "calib/step_q_gap": 0.07846698467062285, "calib/step_q_w": 0.7018551236749117, "calib/step_q_w_n": 566.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2870.0, "completions/max_terminated_length": 2870.0, "completions/mean_length": 475.75, "completions/mean_terminated_length": 481.3913269042969, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.0096, "grad_norm": 0.03932953625917435, "kl": 0.00033918023109436035, "learning_rate": 2.25e-06, "loss": 0.0134, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.0339675210416317, "mask/share_reasoning": 0.8485729694366455, "mask/share_step_conf": 0.10574081540107727, "num_tokens": 2108896.0, "reward": 0.2592368423938751, "reward_std": 0.19198694825172424, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.651790976524353, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": -0.44347357749938965, "step": 9 }, { "adv/mean_abs_final_conf": 0.7341861128807068, "adv/mean_abs_reasoning": 0.43882063031196594, "adv/mean_abs_step_conf": 0.7601568698883057, "adv/ratio_final_to_reasoning": 1.6730893266316123, "adv/ratio_step_to_reasoning": 1.732272407858071, "adv/std_final_conf": 0.9312600493431091, "adv/std_reasoning": 0.7391703724861145, "adv/std_step_conf": 0.9348811507225037, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4770261437908497, "calib/avg_num_step_conf": 4.9921875, "calib/ece": 0.285952380952381, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.3055555555555556, "calib/gap": 0.004470588235293782, "calib/mean_conf": 0.8811904761904763, "calib/mu_c": 0.8829999999999998, "calib/mu_w": 0.878529411764706, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.285952380952381, "calib/std_conf": 0.07005788018462818, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7861420612813369, "calib/step_q_c_n": 718.0, "calib/step_q_gap": 0.003445632709908275, "calib/step_q_w": 0.7826964285714286, "calib/step_q_w_n": 560.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2653.0, "completions/max_terminated_length": 2653.0, "completions/mean_length": 534.4375, "completions/mean_terminated_length": 534.4375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.010666666666666666, "grad_norm": 0.2811925411224365, "kl": 0.48865118622779846, "learning_rate": 2.5e-06, "loss": 0.1109, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03223879635334015, "mask/share_reasoning": 0.8597942590713501, "mask/share_step_conf": 0.10796696692705154, "num_tokens": 2352512.0, "reward": 0.2733853757381439, "reward_std": 0.18411020934581757, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6640077829360962, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": -0.4312995672225952, "step": 10 }, { "adv/mean_abs_final_conf": 0.7714605331420898, "adv/mean_abs_reasoning": 0.3798549175262451, "adv/mean_abs_step_conf": 0.7588397264480591, "adv/ratio_final_to_reasoning": 2.0309347004538587, "adv/ratio_step_to_reasoning": 1.9977093659597784, "adv/std_final_conf": 0.9286066293716431, "adv/std_reasoning": 0.6613515615463257, "adv/std_step_conf": 0.9348770380020142, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.3845270890725436, "calib/avg_num_step_conf": 5.2890625, "calib/ece": 0.30011857707509887, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.31620553359683795, "calib/gap": -0.03282106782106786, "calib/mean_conf": 0.8771936758893281, "calib/mu_c": 0.8643506493506494, "calib/mu_w": 0.8971717171717173, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.284308300395257, "calib/std_conf": 0.0929895975711709, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7925751633986927, "calib/step_q_c_n": 765.0, "calib/step_q_gap": -0.0025861269238878215, "calib/step_q_w": 0.7951612903225805, "calib/step_q_w_n": 589.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1925.0, "completions/max_terminated_length": 1925.0, "completions/mean_length": 509.4140625, "completions/mean_terminated_length": 511.41180419921875, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.011733333333333333, "grad_norm": 0.04469301551580429, "kl": 0.00034499168395996094, "learning_rate": 2.7500000000000004e-06, "loss": 0.1212, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0329505056142807, "mask/share_reasoning": 0.8455782532691956, "mask/share_step_conf": 0.11756500601768494, "num_tokens": 2587402.0, "reward": 0.27336350083351135, "reward_std": 0.1707531213760376, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6576433777809143, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": -0.4288851022720337, "step": 11 }, { "adv/mean_abs_final_conf": 0.7525328993797302, "adv/mean_abs_reasoning": 0.4725276231765747, "adv/mean_abs_step_conf": 0.7634186148643494, "adv/ratio_final_to_reasoning": 1.5925691165329456, "adv/ratio_step_to_reasoning": 1.6156063210278697, "adv/std_final_conf": 0.9256395697593689, "adv/std_reasoning": 0.7393105626106262, "adv/std_step_conf": 0.9346576929092407, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4485477770641547, "calib/avg_num_step_conf": 5.68359375, "calib/ece": 0.2087795275590551, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.3700787401574803, "calib/gap": 0.000744308856062248, "calib/mean_conf": 0.8833464566929135, "calib/mu_c": 0.8835838150289018, "calib/mu_w": 0.8828395061728396, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.205511811023622, "calib/std_conf": 0.0719102544298752, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8022555555555555, "calib/step_q_c_n": 900.0, "calib/step_q_gap": 0.04533663663663656, "calib/step_q_w": 0.7569189189189189, "calib/step_q_w_n": 555.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1694.0, "completions/max_terminated_length": 1694.0, "completions/mean_length": 470.015625, "completions/mean_terminated_length": 471.8588562011719, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.0128, "grad_norm": 0.03708720952272415, "kl": 0.0020183920860290527, "learning_rate": 3e-06, "loss": 0.0311, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03654616326093674, "mask/share_reasoning": 0.8310366868972778, "mask/share_step_conf": 0.12851089239120483, "num_tokens": 2811902.0, "reward": 0.34594082832336426, "reward_std": 0.18064436316490173, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7299535274505615, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": -0.370103120803833, "step": 12 }, { "adv/mean_abs_final_conf": 0.7555368542671204, "adv/mean_abs_reasoning": 0.5390787720680237, "adv/mean_abs_step_conf": 0.7594738602638245, "adv/ratio_final_to_reasoning": 1.4015333072172669, "adv/ratio_step_to_reasoning": 1.4088365181777, "adv/std_final_conf": 0.9309623837471008, "adv/std_reasoning": 0.7927247881889343, "adv/std_step_conf": 0.9354305267333984, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.582262534643487, "calib/avg_num_step_conf": 4.875, "calib/ece": 0.3088235294117647, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.3607843137254902, "calib/gap": 0.01705404383975806, "calib/mean_conf": 0.8852941176470588, "calib/mu_c": 0.8925170068027212, "calib/mu_w": 0.8754629629629631, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3088235294117647, "calib/std_conf": 0.04986070562474947, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7898414985590779, "calib/step_q_c_n": 694.0, "calib/step_q_gap": -0.0033714978308138077, "calib/step_q_w": 0.7932129963898917, "calib/step_q_w_n": 554.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2430.0, "completions/max_terminated_length": 2430.0, "completions/mean_length": 478.98828125, "completions/mean_terminated_length": 478.98828125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.013866666666666666, "grad_norm": 0.03337820991873741, "kl": 0.0012568831443786621, "learning_rate": 3.2500000000000002e-06, "loss": 0.0371, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03469127416610718, "mask/share_reasoning": 0.8514494895935059, "mask/share_step_conf": 0.11385929584503174, "num_tokens": 3039115.0, "reward": 0.2829028069972992, "reward_std": 0.19942107796669006, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6625644564628601, "rewards/format_reward_step": 0.9921875, "rewards/step_l1_reward": -0.41004008054733276, "step": 13 }, { "adv/mean_abs_final_conf": 0.7825286388397217, "adv/mean_abs_reasoning": 0.5312036275863647, "adv/mean_abs_step_conf": 0.7674595713615417, "adv/ratio_final_to_reasoning": 1.4731236727341357, "adv/ratio_step_to_reasoning": 1.4447558930436064, "adv/std_final_conf": 0.9330103993415833, "adv/std_reasoning": 0.7576258182525635, "adv/std_step_conf": 0.9351935982704163, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.42071611253196933, "calib/avg_num_step_conf": 5.21875, "calib/ece": 0.36322709163346617, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.42231075697211157, "calib/gap": -0.013297314578005093, "calib/mean_conf": 0.8955776892430279, "calib/mu_c": 0.8894852941176471, "calib/mu_w": 0.9027826086956522, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.35848605577689246, "calib/std_conf": 0.045655527938178404, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7928291316526611, "calib/step_q_c_n": 714.0, "calib/step_q_gap": 0.019436848694461606, "calib/step_q_w": 0.7733922829581995, "calib/step_q_w_n": 622.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2482.0, "completions/max_terminated_length": 2482.0, "completions/mean_length": 515.640625, "completions/mean_terminated_length": 523.825439453125, "completions/min_length": 0.0, "completions/min_terminated_length": 122.0, "epoch": 0.014933333333333333, "grad_norm": 0.04907077178359032, "kl": 0.005765676498413086, "learning_rate": 3.5e-06, "loss": 0.0194, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03275441378355026, "mask/share_reasoning": 0.8358793258666992, "mask/share_step_conf": 0.1157413199543953, "num_tokens": 3276519.0, "reward": 0.2413352131843567, "reward_std": 0.19809089601039886, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6020089983940125, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": -0.4209010601043701, "step": 14 }, { "adv/mean_abs_final_conf": 0.7614812850952148, "adv/mean_abs_reasoning": 0.4563322365283966, "adv/mean_abs_step_conf": 0.77045738697052, "adv/ratio_final_to_reasoning": 1.6686993031399162, "adv/ratio_step_to_reasoning": 1.6883694056590632, "adv/std_final_conf": 0.927528977394104, "adv/std_reasoning": 0.7205897569656372, "adv/std_step_conf": 0.9347683787345886, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.48810442386831276, "calib/avg_num_step_conf": 5.07421875, "calib/ece": 0.33043650793650803, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5198412698412699, "calib/gap": -0.0006249999999999867, "calib/mean_conf": 0.9018650793650793, "calib/mu_c": 0.9015972222222222, "calib/mu_w": 0.9022222222222221, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33043650793650803, "calib/std_conf": 0.04355362561668983, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7819583333333333, "calib/step_q_c_n": 720.0, "calib/step_q_gap": -0.0040347582037997265, "calib/step_q_w": 0.785993091537133, "calib/step_q_w_n": 579.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1891.0, "completions/max_terminated_length": 1891.0, "completions/mean_length": 455.0, "completions/mean_terminated_length": 456.7843322753906, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.016, "grad_norm": 0.04268636927008629, "kl": 0.001432657241821289, "learning_rate": 3.7500000000000005e-06, "loss": -0.0248, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.035845156759023666, "mask/share_reasoning": 0.8433025479316711, "mask/share_step_conf": 0.11694604158401489, "num_tokens": 3500879.0, "reward": 0.2575795650482178, "reward_std": 0.18040362000465393, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6336527466773987, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": -0.42786863446235657, "step": 15 }, { "adv/mean_abs_final_conf": 0.7424015998840332, "adv/mean_abs_reasoning": 0.4367170035839081, "adv/mean_abs_step_conf": 0.7670466899871826, "adv/ratio_final_to_reasoning": 1.6999603720293268, "adv/ratio_step_to_reasoning": 1.7563930043768197, "adv/std_final_conf": 0.9280498623847961, "adv/std_reasoning": 0.7014251351356506, "adv/std_step_conf": 0.935114324092865, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4426333333333333, "calib/avg_num_step_conf": 6.36328125, "calib/ece": 0.31379999999999997, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.676, "calib/gap": -0.0008333333333332416, "calib/mean_conf": 0.9138000000000001, "calib/mu_c": 0.9134666666666669, "calib/mu_w": 0.9143000000000001, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31379999999999997, "calib/std_conf": 0.048616458118624806, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.780503355704698, "calib/step_q_c_n": 894.0, "calib/step_q_gap": 0.041033967949595906, "calib/step_q_w": 0.7394693877551021, "calib/step_q_w_n": 735.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3040.0, "completions/max_terminated_length": 3040.0, "completions/mean_length": 652.76171875, "completions/mean_terminated_length": 652.76171875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.017066666666666667, "grad_norm": 0.05110244080424309, "kl": 0.0014814138412475586, "learning_rate": 4.000000000000001e-06, "loss": 0.0212, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.026477422565221786, "mask/share_reasoning": 0.8645951747894287, "mask/share_step_conf": 0.10892736911773682, "num_tokens": 3776834.0, "reward": 0.2704630196094513, "reward_std": 0.17843686044216156, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6428714990615845, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": -0.41366422176361084, "step": 16 }, { "adv/mean_abs_final_conf": 0.7551780939102173, "adv/mean_abs_reasoning": 0.42120128870010376, "adv/mean_abs_step_conf": 0.7566851377487183, "adv/ratio_final_to_reasoning": 1.7929149652908725, "adv/ratio_step_to_reasoning": 1.7964929311683082, "adv/std_final_conf": 0.9255588054656982, "adv/std_reasoning": 0.7013599872589111, "adv/std_step_conf": 0.9355192184448242, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5302579365079365, "calib/avg_num_step_conf": 5.328125, "calib/ece": 0.16675889328063234, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6442687747035574, "calib/gap": 0.009890046296296418, "calib/mean_conf": 0.9137944664031621, "calib/mu_c": 0.9162962962962964, "calib/mu_w": 0.90640625, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.16675889328063234, "calib/std_conf": 0.04511545629002079, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.7666471734892788, "calib/step_q_c_n": 1026.0, "calib/step_q_gap": 0.026173800708213735, "calib/step_q_w": 0.7404733727810651, "calib/step_q_w_n": 338.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2091.0, "completions/max_terminated_length": 2091.0, "completions/mean_length": 518.14453125, "completions/mean_terminated_length": 518.14453125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.018133333333333335, "grad_norm": 0.05879347026348114, "kl": 0.002603292465209961, "learning_rate": 4.25e-06, "loss": 0.0993, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03295683488249779, "mask/share_reasoning": 0.8492019176483154, "mask/share_step_conf": 0.11784122884273529, "num_tokens": 4013007.0, "reward": 0.37962839007377625, "reward_std": 0.1868617981672287, "rewards/accuracy_reward_step": 0.73828125, "rewards/final_brier_reward_step": 0.764146089553833, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": -0.3478580713272095, "step": 17 }, { "adv/mean_abs_final_conf": 0.7532843947410583, "adv/mean_abs_reasoning": 0.35541170835494995, "adv/mean_abs_step_conf": 0.7603262066841125, "adv/ratio_final_to_reasoning": 2.1194698346537097, "adv/ratio_step_to_reasoning": 2.1392829465392125, "adv/std_final_conf": 0.9244809746742249, "adv/std_reasoning": 0.6612080335617065, "adv/std_step_conf": 0.9350612163543701, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4683257918552037, "calib/avg_num_step_conf": 5.41796875, "calib/ece": 0.39871485943775115, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.678714859437751, "calib/gap": -0.002163542340013147, "calib/mean_conf": 0.9208032128514058, "calib/mu_c": 0.9197692307692308, "calib/mu_w": 0.9219327731092439, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.39871485943775115, "calib/std_conf": 0.04024220633117022, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.74281098546042, "calib/step_q_c_n": 619.0, "calib/step_q_gap": 0.10115733962708673, "calib/step_q_w": 0.6416536458333333, "calib/step_q_w_n": 768.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2919.0, "completions/max_terminated_length": 2919.0, "completions/mean_length": 541.265625, "completions/mean_terminated_length": 541.265625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.0192, "grad_norm": 0.03853190690279007, "kl": 0.0032968521118164062, "learning_rate": 4.5e-06, "loss": 0.0616, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.0334947407245636, "mask/share_reasoning": 0.8596949577331543, "mask/share_step_conf": 0.10681027919054031, "num_tokens": 4262291.0, "reward": 0.21546542644500732, "reward_std": 0.14310386776924133, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5657578706741333, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": -0.42701447010040283, "step": 18 }, { "adv/mean_abs_final_conf": 0.757287859916687, "adv/mean_abs_reasoning": 0.37841731309890747, "adv/mean_abs_step_conf": 0.7881565093994141, "adv/ratio_final_to_reasoning": 2.001197708728389, "adv/ratio_step_to_reasoning": 2.0827707457280438, "adv/std_final_conf": 0.9230532646179199, "adv/std_reasoning": 0.6611937284469604, "adv/std_step_conf": 0.9348154664039612, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.536557855626327, "calib/avg_num_step_conf": 4.39453125, "calib/ece": 0.29727272727272724, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7865612648221344, "calib/gap": 0.026543922505307904, "calib/mean_conf": 0.9178260869565217, "calib/mu_c": 0.9278980891719746, "calib/mu_w": 0.9013541666666667, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.29727272727272724, "calib/std_conf": 0.0817376587202468, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7642815249266862, "calib/step_q_c_n": 682.0, "calib/step_q_gap": 0.02811899670998197, "calib/step_q_w": 0.7361625282167042, "calib/step_q_w_n": 443.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2572.0, "completions/max_terminated_length": 2572.0, "completions/mean_length": 507.07421875, "completions/mean_terminated_length": 507.07421875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.020266666666666665, "grad_norm": 0.03658737987279892, "kl": 0.004879474639892578, "learning_rate": 4.75e-06, "loss": 0.052, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.032567959278821945, "mask/share_reasoning": 0.8663681745529175, "mask/share_step_conf": 0.10106386244297028, "num_tokens": 4496862.0, "reward": 0.3029266595840454, "reward_std": 0.17736773192882538, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6701222658157349, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": -0.3838001787662506, "step": 19 }, { "adv/mean_abs_final_conf": 0.735227108001709, "adv/mean_abs_reasoning": 0.443066269159317, "adv/mean_abs_step_conf": 0.7690048813819885, "adv/ratio_final_to_reasoning": 1.6594066377400922, "adv/ratio_step_to_reasoning": 1.7356430288433242, "adv/std_final_conf": 0.9224920272827148, "adv/std_reasoning": 0.7391899824142456, "adv/std_step_conf": 0.9353877305984497, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.47368758002560823, "calib/avg_num_step_conf": 5.12109375, "calib/ece": 0.36718253968253967, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.8333333333333334, "calib/gap": 0.0013585147247117124, "calib/mean_conf": 0.930674603174603, "calib/mu_c": 0.9312676056338027, "calib/mu_w": 0.929909090909091, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.36718253968253967, "calib/std_conf": 0.040147808132678944, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7546852122986824, "calib/step_q_c_n": 683.0, "calib/step_q_gap": 0.047981390642631294, "calib/step_q_w": 0.7067038216560511, "calib/step_q_w_n": 628.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2121.0, "completions/max_terminated_length": 2121.0, "completions/mean_length": 469.92578125, "completions/mean_terminated_length": 471.7686462402344, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.021333333333333333, "grad_norm": 0.0296552162617445, "kl": 0.007416725158691406, "learning_rate": 5e-06, "loss": 0.0436, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.037708643823862076, "mask/share_reasoning": 0.8331930637359619, "mask/share_step_conf": 0.1251920461654663, "num_tokens": 4722035.0, "reward": 0.2628268599510193, "reward_std": 0.190019890666008, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6044167876243591, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": -0.38501307368278503, "step": 20 }, { "adv/mean_abs_final_conf": 0.7824727892875671, "adv/mean_abs_reasoning": 0.5332362651824951, "adv/mean_abs_step_conf": 0.7937630414962769, "adv/ratio_final_to_reasoning": 1.4674035514440735, "adv/ratio_step_to_reasoning": 1.4885766278942392, "adv/std_final_conf": 0.9178428053855896, "adv/std_reasoning": 0.7575809359550476, "adv/std_step_conf": 0.9357240200042725, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5421815216709528, "calib/avg_num_step_conf": 4.96875, "calib/ece": 0.36324218750000004, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.9140625, "calib/gap": 0.007936398419368884, "calib/mean_conf": 0.9452734375, "calib/mu_c": 0.9485906040268455, "calib/mu_w": 0.9406542056074766, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36324218750000004, "calib/std_conf": 0.03473003357230731, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7154131054131054, "calib/step_q_c_n": 702.0, "calib/step_q_gap": 0.021465736992052897, "calib/step_q_w": 0.6939473684210525, "calib/step_q_w_n": 570.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1699.0, "completions/max_terminated_length": 1699.0, "completions/mean_length": 470.1484375, "completions/mean_terminated_length": 471.9921875, "completions/min_length": 0.0, "completions/min_terminated_length": 126.0, "epoch": 0.0224, "grad_norm": 0.028702791780233383, "kl": 0.010184288024902344, "learning_rate": 4.9722222222222224e-06, "loss": -0.0438, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.035543717443943024, "mask/share_reasoning": 0.846480131149292, "mask/share_step_conf": 0.11406994611024857, "num_tokens": 4945353.0, "reward": 0.2834588885307312, "reward_std": 0.2273433953523636, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6274394989013672, "rewards/format_reward_step": 1.0, "rewards/step_l1_reward": -0.37692791223526, "step": 21 }, { "adv/mean_abs_final_conf": 0.7335162162780762, "adv/mean_abs_reasoning": 0.4049103260040283, "adv/mean_abs_step_conf": 0.7814478278160095, "adv/ratio_final_to_reasoning": 1.8115522602670762, "adv/ratio_step_to_reasoning": 1.9299281288475592, "adv/std_final_conf": 0.9055657386779785, "adv/std_reasoning": 0.6815546154975891, "adv/std_step_conf": 0.9354314804077148, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4629916819399408, "calib/avg_num_step_conf": 5.1640625, "calib/ece": 0.26784313725490183, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9372549019607843, "calib/gap": -0.002974763851684603, "calib/mean_conf": 0.9462745098039216, "calib/mu_c": 0.9453179190751445, "calib/mu_w": 0.9482926829268291, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26784313725490183, "calib/std_conf": 0.023192357117836803, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.715356738391846, "calib/step_q_c_n": 883.0, "calib/step_q_gap": 0.014670178027381242, "calib/step_q_w": 0.7006865603644647, "calib/step_q_w_n": 439.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1195.0, "completions/max_terminated_length": 1195.0, "completions/mean_length": 445.1640625, "completions/mean_terminated_length": 446.9098205566406, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.023466666666666667, "grad_norm": 0.03779933229088783, "kl": 0.012391090393066406, "learning_rate": 4.944444444444445e-06, "loss": 0.0277, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.0352453738451004, "mask/share_reasoning": 0.8380044102668762, "mask/share_step_conf": 0.12284398078918457, "num_tokens": 5161131.0, "reward": 0.3495637774467468, "reward_std": 0.18510903418064117, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.705495297908783, "rewards/format_reward_step": 0.99609375, "rewards/step_l1_reward": -0.3407427668571472, "step": 22 }, { "adv/mean_abs_final_conf": 0.7578845024108887, "adv/mean_abs_reasoning": 0.5459873676300049, "adv/mean_abs_step_conf": 0.7861717939376831, "adv/ratio_final_to_reasoning": 1.3880989695799675, "adv/ratio_step_to_reasoning": 1.4399083944931894, "adv/std_final_conf": 0.9229386448860168, "adv/std_reasoning": 0.79274982213974, "adv/std_step_conf": 0.9354714155197144, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.539328626285148, "calib/avg_num_step_conf": 4.9140625, "calib/ece": 0.418, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9686274509803922, "calib/gap": 0.0005611296915644104, "calib/mean_conf": 0.954235294117647, "calib/mu_c": 0.9544927536231882, "calib/mu_w": 0.9539316239316238, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.41552941176470587, "calib/std_conf": 0.025593035384799748, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7103669724770642, "calib/step_q_c_n": 654.0, "calib/step_q_gap": 0.02596299896713039, "calib/step_q_w": 0.6844039735099338, "calib/step_q_w_n": 604.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2725.0, "completions/max_terminated_length": 2725.0, "completions/mean_length": 496.51171875, "completions/mean_terminated_length": 496.51171875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.024533333333333334, "grad_norm": 0.02881755866110325, "kl": 0.014128684997558594, "learning_rate": 4.9166666666666665e-06, "loss": 0.0437, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03612464666366577, "mask/share_reasoning": 0.8429062366485596, "mask/share_step_conf": 0.12096910178661346, "num_tokens": 5392174.0, "reward": 0.23914536833763123, "reward_std": 0.21841177344322205, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5744590163230896, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": -0.4016370177268982, "step": 23 }, { "adv/mean_abs_final_conf": 0.7407926321029663, "adv/mean_abs_reasoning": 0.5218784809112549, "adv/mean_abs_step_conf": 0.7790415287017822, "adv/ratio_final_to_reasoning": 1.4194734199606471, "adv/ratio_step_to_reasoning": 1.4927642299822241, "adv/std_final_conf": 0.9203848838806152, "adv/std_reasoning": 0.7753624320030212, "adv/std_step_conf": 0.9355775117874146, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4989289314516129, "calib/avg_num_step_conf": 5.484375, "calib/ece": 0.4689285714285714, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9603174603174603, "calib/gap": -0.007467237903225898, "calib/mean_conf": 0.9523412698412698, "calib/mu_c": 0.9485483870967741, "calib/mu_w": 0.956015625, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.4646031746031746, "calib/std_conf": 0.05403256847060262, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.6813832853025936, "calib/step_q_c_n": 694.0, "calib/step_q_gap": 0.04469314445752326, "calib/step_q_w": 0.6366901408450704, "calib/step_q_w_n": 710.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2591.0, "completions/max_terminated_length": 2591.0, "completions/mean_length": 548.1875, "completions/mean_terminated_length": 548.1875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.0256, "grad_norm": 0.04038697108626366, "kl": 0.019349098205566406, "learning_rate": 4.888888888888889e-06, "loss": 0.0611, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03291749954223633, "mask/share_reasoning": 0.8484556674957275, "mask/share_step_conf": 0.11862681806087494, "num_tokens": 5637022.0, "reward": 0.21600434184074402, "reward_std": 0.22113731503486633, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.5229433178901672, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": -0.383903443813324, "step": 24 }, { "adv/mean_abs_final_conf": 0.72950679063797, "adv/mean_abs_reasoning": 0.40538179874420166, "adv/mean_abs_step_conf": 0.7633631229400635, "adv/ratio_final_to_reasoning": 1.7995548712296605, "adv/ratio_step_to_reasoning": 1.8830720207587568, "adv/std_final_conf": 0.90348219871521, "adv/std_reasoning": 0.6816384196281433, "adv/std_step_conf": 0.9356397390365601, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5344110115236875, "calib/avg_num_step_conf": 5.32421875, "calib/ece": 0.3967460317460318, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9801587301587301, "calib/gap": 0.0036491677336746298, "calib/mean_conf": 0.9602380952380952, "calib/mu_c": 0.9618309859154929, "calib/mu_w": 0.9581818181818182, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3967460317460318, "calib/std_conf": 0.02092937048152271, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6436760563380282, "calib/step_q_c_n": 710.0, "calib/step_q_gap": -0.0035827491749886864, "calib/step_q_w": 0.6472588055130168, "calib/step_q_w_n": 653.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2419.0, "completions/max_terminated_length": 2419.0, "completions/mean_length": 481.6328125, "completions/mean_terminated_length": 483.5216064453125, "completions/min_length": 0.0, "completions/min_terminated_length": 184.0, "epoch": 0.02666666666666667, "grad_norm": 0.028889209032058716, "kl": 0.019657135009765625, "learning_rate": 4.861111111111111e-06, "loss": 0.0584, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03381389379501343, "mask/share_reasoning": 0.8391128182411194, "mask/share_step_conf": 0.1231670081615448, "num_tokens": 5863544.0, "reward": 0.2530894875526428, "reward_std": 0.18848128616809845, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.5886375308036804, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": -0.39027103781700134, "step": 25 }, { "adv/mean_abs_final_conf": 0.7045278549194336, "adv/mean_abs_reasoning": 0.38574790954589844, "adv/mean_abs_step_conf": 0.7433183193206787, "adv/ratio_final_to_reasoning": 1.82639448584129, "adv/ratio_step_to_reasoning": 1.9269535904827362, "adv/std_final_conf": 0.8941695690155029, "adv/std_reasoning": 0.681533932685852, "adv/std_step_conf": 0.935562252998352, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5127207737594617, "calib/avg_num_step_conf": 4.66796875, "calib/ece": 0.3098406374501991, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9920318725099602, "calib/gap": 7.91982057750662e-05, "calib/mean_conf": 0.9623505976095617, "calib/mu_c": 0.9623780487804877, "calib/mu_w": 0.9622988505747126, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3094023904382469, "calib/std_conf": 0.018175017595124958, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6774024226110363, "calib/step_q_c_n": 743.0, "calib/step_q_gap": 0.036849325265903565, "calib/step_q_w": 0.6405530973451328, "calib/step_q_w_n": 452.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2450.0, "completions/max_terminated_length": 2450.0, "completions/mean_length": 480.02734375, "completions/mean_terminated_length": 483.8070983886719, "completions/min_length": 0.0, "completions/min_terminated_length": 214.0, "epoch": 0.027733333333333332, "grad_norm": 0.19219711422920227, "kl": 0.1522216796875, "learning_rate": 4.833333333333333e-06, "loss": -0.0185, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03214826434850693, "mask/share_reasoning": 0.8559674024581909, "mask/share_step_conf": 0.10407190024852753, "num_tokens": 6091671.0, "reward": 0.32004281878471375, "reward_std": 0.1753888726234436, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6643816232681274, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": -0.3477335572242737, "step": 26 }, { "adv/mean_abs_final_conf": 0.768155574798584, "adv/mean_abs_reasoning": 0.5936883687973022, "adv/mean_abs_step_conf": 0.7633221745491028, "adv/ratio_final_to_reasoning": 1.2938700085277375, "adv/ratio_step_to_reasoning": 1.285728699882475, "adv/std_final_conf": 0.9210662245750427, "adv/std_reasoning": 0.8098739385604858, "adv/std_step_conf": 0.9356999397277832, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4615677321156773, "calib/avg_num_step_conf": 5.1328125, "calib/ece": 0.3874409448818897, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9645669291338582, "calib/gap": -0.0015956367326230847, "calib/mean_conf": 0.9538976377952756, "calib/mu_c": 0.9532191780821919, "calib/mu_w": 0.954814814814815, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.383267716535433, "calib/std_conf": 0.07715515961902557, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6421166892808684, "calib/step_q_c_n": 737.0, "calib/step_q_gap": 0.005686879922116139, "calib/step_q_w": 0.6364298093587523, "calib/step_q_w_n": 577.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2831.0, "completions/max_terminated_length": 2831.0, "completions/mean_length": 475.65625, "completions/mean_terminated_length": 477.5216064453125, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.0288, "grad_norm": 0.03019108809530735, "kl": 0.025396347045898438, "learning_rate": 4.805555555555556e-06, "loss": -0.0089, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.034386180341243744, "mask/share_reasoning": 0.8439096212387085, "mask/share_step_conf": 0.11779787391424179, "num_tokens": 6318655.0, "reward": 0.26763755083084106, "reward_std": 0.2534090578556061, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6004222631454468, "rewards/format_reward_step": 0.9921875, "rewards/step_l1_reward": -0.37764716148376465, "step": 27 }, { "adv/mean_abs_final_conf": 0.7378636002540588, "adv/mean_abs_reasoning": 0.3979493975639343, "adv/mean_abs_step_conf": 0.7608801126480103, "adv/ratio_final_to_reasoning": 1.8541643856503491, "adv/ratio_step_to_reasoning": 1.9120021724012477, "adv/std_final_conf": 0.8981975317001343, "adv/std_reasoning": 0.6817038059234619, "adv/std_step_conf": 0.9358172416687012, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5226771021473008, "calib/avg_num_step_conf": 4.57421875, "calib/ece": 0.3626000000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.988, "calib/gap": 0.0027326242558030067, "calib/mean_conf": 0.9665999999999999, "calib/mu_c": 0.967682119205298, "calib/mu_w": 0.964949494949495, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.3626000000000001, "calib/std_conf": 0.01988064385275286, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.6198295454545455, "calib/step_q_c_n": 704.0, "calib/step_q_gap": -0.018564458828109753, "calib/step_q_w": 0.6383940042826552, "calib/step_q_w_n": 467.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2692.0, "completions/max_terminated_length": 2692.0, "completions/mean_length": 526.57421875, "completions/mean_terminated_length": 528.6392211914062, "completions/min_length": 0.0, "completions/min_terminated_length": 195.0, "epoch": 0.029866666666666666, "grad_norm": 0.028602536767721176, "kl": 0.024019241333007812, "learning_rate": 4.777777777777778e-06, "loss": 0.0132, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03136756643652916, "mask/share_reasoning": 0.8636547923088074, "mask/share_step_conf": 0.10107140243053436, "num_tokens": 6560402.0, "reward": 0.2832421660423279, "reward_std": 0.17565187811851501, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6112655997276306, "rewards/format_reward_step": 0.96875, "rewards/step_l1_reward": -0.3565000593662262, "step": 28 }, { "adv/mean_abs_final_conf": 0.7392631769180298, "adv/mean_abs_reasoning": 0.4967987537384033, "adv/mean_abs_step_conf": 0.7861140966415405, "adv/ratio_final_to_reasoning": 1.4880536059220866, "adv/ratio_step_to_reasoning": 1.5823592364635448, "adv/std_final_conf": 0.9159027934074402, "adv/std_reasoning": 0.7575135827064514, "adv/std_step_conf": 0.935678243637085, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5181607901975493, "calib/avg_num_step_conf": 5.8515625, "calib/ece": 0.4582608695652173, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00046886721680405863, "calib/mean_conf": 0.9681422924901185, "calib/mu_c": 0.9683720930232558, "calib/mu_w": 0.9679032258064517, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4582608695652173, "calib/std_conf": 0.017105448068898105, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5549710982658959, "calib/step_q_c_n": 692.0, "calib/step_q_gap": 0.01014479553636738, "calib/step_q_w": 0.5448263027295285, "calib/step_q_w_n": 806.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2333.0, "completions/max_terminated_length": 2333.0, "completions/mean_length": 551.26953125, "completions/mean_terminated_length": 555.6102294921875, "completions/min_length": 0.0, "completions/min_terminated_length": 187.0, "epoch": 0.030933333333333334, "grad_norm": 0.028186749666929245, "kl": 0.02429962158203125, "learning_rate": 4.75e-06, "loss": -0.0722, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02976279705762863, "mask/share_reasoning": 0.8515908122062683, "mask/share_step_conf": 0.11083388328552246, "num_tokens": 6808655.0, "reward": 0.23735710978507996, "reward_std": 0.20843389630317688, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.5337077975273132, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": -0.3574310839176178, "step": 29 }, { "adv/mean_abs_final_conf": 0.7558501958847046, "adv/mean_abs_reasoning": 0.50095134973526, "adv/mean_abs_step_conf": 0.7730767726898193, "adv/ratio_final_to_reasoning": 1.5088295425976037, "adv/ratio_step_to_reasoning": 1.5432172667033848, "adv/std_final_conf": 0.9127808809280396, "adv/std_reasoning": 0.7576369643211365, "adv/std_step_conf": 0.9355639219284058, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4106060606060606, "calib/avg_num_step_conf": 5.23828125, "calib/ece": 0.39569721115537865, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9760956175298805, "calib/gap": 0.012989039329464935, "calib/mean_conf": 0.9540239043824702, "calib/mu_c": 0.9597163120567376, "calib/mu_w": 0.9467272727272726, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3939840637450201, "calib/std_conf": 0.09227181085240978, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5691840455840456, "calib/step_q_c_n": 702.0, "calib/step_q_gap": 0.0016879579471128503, "calib/step_q_w": 0.5674960876369327, "calib/step_q_w_n": 639.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2748.0, "completions/max_terminated_length": 2748.0, "completions/mean_length": 574.0546875, "completions/mean_terminated_length": 576.305908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 200.0, "epoch": 0.032, "grad_norm": 0.025844834744930267, "kl": 0.025775909423828125, "learning_rate": 4.722222222222222e-06, "loss": 0.0335, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.029186809435486794, "mask/share_reasoning": 0.8634512424468994, "mask/share_step_conf": 0.10345575958490372, "num_tokens": 7062597.0, "reward": 0.2614104151725769, "reward_std": 0.21709409356117249, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.5859875082969666, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": -0.368635356426239, "step": 30 }, { "adv/mean_abs_final_conf": 0.7692661881446838, "adv/mean_abs_reasoning": 0.5111313462257385, "adv/mean_abs_step_conf": 0.7831858396530151, "adv/ratio_final_to_reasoning": 1.5050264356217773, "adv/ratio_step_to_reasoning": 1.5322594582315543, "adv/std_final_conf": 0.9017819762229919, "adv/std_reasoning": 0.7577013373374939, "adv/std_step_conf": 0.9357402324676514, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5134594882729211, "calib/avg_num_step_conf": 5.609375, "calib/ece": 0.506910569105691, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9878048780487805, "calib/gap": 0.009518923240938282, "calib/mean_conf": 0.9609756097560977, "calib/mu_c": 0.9661607142857144, "calib/mu_w": 0.9566417910447761, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.50630081300813, "calib/std_conf": 0.0653248870533324, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.605117540687161, "calib/step_q_c_n": 553.0, "calib/step_q_gap": 0.08459658938478276, "calib/step_q_w": 0.5205209513023782, "calib/step_q_w_n": 883.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2875.0, "completions/max_terminated_length": 2875.0, "completions/mean_length": 617.109375, "completions/mean_terminated_length": 621.968505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.03306666666666667, "grad_norm": 0.023726139217615128, "kl": 0.0217742919921875, "learning_rate": 4.694444444444445e-06, "loss": -0.017, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.029293062165379524, "mask/share_reasoning": 0.8607058525085449, "mask/share_step_conf": 0.1021885871887207, "num_tokens": 7326489.0, "reward": 0.19026575982570648, "reward_std": 0.21116501092910767, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.47342658042907715, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": -0.37180131673812866, "step": 31 }, { "adv/mean_abs_final_conf": 0.7366917133331299, "adv/mean_abs_reasoning": 0.40366196632385254, "adv/mean_abs_step_conf": 0.7732025384902954, "adv/ratio_final_to_reasoning": 1.8250213663729025, "adv/ratio_step_to_reasoning": 1.915470376195823, "adv/std_final_conf": 0.8951123952865601, "adv/std_reasoning": 0.6816351413726807, "adv/std_step_conf": 0.9357044696807861, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5704663544807236, "calib/avg_num_step_conf": 4.9140625, "calib/ece": 0.43972000000000006, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.988, "calib/gap": 0.01046699595868883, "calib/mean_conf": 0.96372, "calib/mu_c": 0.9687022900763358, "calib/mu_w": 0.958235294117647, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.43972000000000006, "calib/std_conf": 0.03922450254624015, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5661003262642741, "calib/step_q_c_n": 613.0, "calib/step_q_gap": 0.016999551070475682, "calib/step_q_w": 0.5491007751937984, "calib/step_q_w_n": 645.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2225.0, "completions/max_terminated_length": 2225.0, "completions/mean_length": 513.23046875, "completions/mean_terminated_length": 519.3162231445312, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.034133333333333335, "grad_norm": 0.029085859656333923, "kl": 0.029659271240234375, "learning_rate": 4.666666666666667e-06, "loss": 0.0095, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03149807080626488, "mask/share_reasoning": 0.8505294919013977, "mask/share_step_conf": 0.1062537431716919, "num_tokens": 7564580.0, "reward": 0.23524346947669983, "reward_std": 0.17312288284301758, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.547758936882019, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": -0.3749282956123352, "step": 32 }, { "adv/mean_abs_final_conf": 0.741101861000061, "adv/mean_abs_reasoning": 0.49518057703971863, "adv/mean_abs_step_conf": 0.7677923440933228, "adv/ratio_final_to_reasoning": 1.4966295031814565, "adv/ratio_step_to_reasoning": 1.5505300080292483, "adv/std_final_conf": 0.8996115326881409, "adv/std_reasoning": 0.7576470971107483, "adv/std_step_conf": 0.9357903599739075, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5124449053668654, "calib/avg_num_step_conf": 5.27734375, "calib/ece": 0.42674698795180727, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9759036144578314, "calib/gap": 0.010530852994555251, "calib/mean_conf": 0.9608835341365461, "calib/mu_c": 0.9657894736842104, "calib/mu_w": 0.9552586206896552, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.42674698795180727, "calib/std_conf": 0.05386304298457405, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5570839260312944, "calib/step_q_c_n": 703.0, "calib/step_q_gap": 0.03455306183376361, "calib/step_q_w": 0.5225308641975308, "calib/step_q_w_n": 648.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2667.0, "completions/max_terminated_length": 2667.0, "completions/mean_length": 533.4453125, "completions/mean_terminated_length": 539.770751953125, "completions/min_length": 0.0, "completions/min_terminated_length": 187.0, "epoch": 0.0352, "grad_norm": 0.023499513044953346, "kl": 0.03248023986816406, "learning_rate": 4.638888888888889e-06, "loss": 0.0092, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.030111689120531082, "mask/share_reasoning": 0.8539035320281982, "mask/share_step_conf": 0.10426604002714157, "num_tokens": 7808014.0, "reward": 0.24479824304580688, "reward_std": 0.20775847136974335, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.5557679533958435, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": -0.3646090030670166, "step": 33 }, { "adv/mean_abs_final_conf": 0.7225061655044556, "adv/mean_abs_reasoning": 0.5000270009040833, "adv/mean_abs_step_conf": 0.7667113542556763, "adv/ratio_final_to_reasoning": 1.4449343019439242, "adv/ratio_step_to_reasoning": 1.533339905383928, "adv/std_final_conf": 0.9046749472618103, "adv/std_reasoning": 0.7575814723968506, "adv/std_step_conf": 0.9356700778007507, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.47993767043241137, "calib/avg_num_step_conf": 5.6953125, "calib/ece": 0.36723320158102757, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9723320158102767, "calib/gap": 0.0009076743280094224, "calib/mean_conf": 0.9640711462450593, "calib/mu_c": 0.9644370860927152, "calib/mu_w": 0.9635294117647057, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36723320158102757, "calib/std_conf": 0.02644307177563116, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5078490566037736, "calib/step_q_c_n": 795.0, "calib/step_q_gap": 0.025209539258373825, "calib/step_q_w": 0.4826395173453997, "calib/step_q_w_n": 663.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2824.0, "completions/max_terminated_length": 2824.0, "completions/mean_length": 489.72265625, "completions/mean_terminated_length": 491.6431579589844, "completions/min_length": 0.0, "completions/min_terminated_length": 198.0, "epoch": 0.03626666666666667, "grad_norm": 0.024688169360160828, "kl": 0.03778839111328125, "learning_rate": 4.611111111111112e-06, "loss": 0.0327, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03325295448303223, "mask/share_reasoning": 0.837381899356842, "mask/share_step_conf": 0.12545892596244812, "num_tokens": 8038495.0, "reward": 0.29707586765289307, "reward_std": 0.21303704380989075, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6169394254684448, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": -0.33841270208358765, "step": 34 }, { "adv/mean_abs_final_conf": 0.7423563003540039, "adv/mean_abs_reasoning": 0.49117863178253174, "adv/mean_abs_step_conf": 0.7686483860015869, "adv/ratio_final_to_reasoning": 1.511377434437499, "adv/ratio_step_to_reasoning": 1.564905996036701, "adv/std_final_conf": 0.899803638458252, "adv/std_reasoning": 0.7392747402191162, "adv/std_step_conf": 0.9357601404190063, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5246336996336997, "calib/avg_num_step_conf": 4.62890625, "calib/ece": 0.4491015624999999, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.9453125, "calib/gap": 0.016330891330891495, "calib/mean_conf": 0.9450390625000001, "calib/mu_c": 0.953076923076923, "calib/mu_w": 0.9367460317460315, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4431640624999999, "calib/std_conf": 0.12427287907713852, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5168055555555555, "calib/step_q_c_n": 576.0, "calib/step_q_gap": 0.0242111384783798, "calib/step_q_w": 0.4925944170771757, "calib/step_q_w_n": 609.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1693.0, "completions/max_terminated_length": 1693.0, "completions/mean_length": 538.1015625, "completions/mean_terminated_length": 540.2117919921875, "completions/min_length": 0.0, "completions/min_terminated_length": 197.0, "epoch": 0.037333333333333336, "grad_norm": 0.026421768590807915, "kl": 0.03264617919921875, "learning_rate": 4.583333333333333e-06, "loss": 0.0089, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.030006268993020058, "mask/share_reasoning": 0.874493420124054, "mask/share_step_conf": 0.09159402549266815, "num_tokens": 8285505.0, "reward": 0.24800075590610504, "reward_std": 0.21784111857414246, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5516136884689331, "rewards/format_reward_step": 1.0, "rewards/step_l1_reward": -0.3571746349334717, "step": 35 }, { "adv/mean_abs_final_conf": 0.6796886324882507, "adv/mean_abs_reasoning": 0.3737061619758606, "adv/mean_abs_step_conf": 0.7662829160690308, "adv/ratio_final_to_reasoning": 1.8187782317920542, "adv/ratio_step_to_reasoning": 2.0504958013470715, "adv/std_final_conf": 0.8769128322601318, "adv/std_reasoning": 0.6815247535705566, "adv/std_step_conf": 0.9357625842094421, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5590420081967213, "calib/avg_num_step_conf": 4.87109375, "calib/ece": 0.20944664031620558, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9841897233201581, "calib/gap": 0.0005729166666664787, "calib/mean_conf": 0.9604347826086956, "calib/mu_c": 0.9605729166666667, "calib/mu_w": 0.9600000000000002, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2054940711462451, "calib/std_conf": 0.06510715320812133, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.502905982905983, "calib/step_q_c_n": 936.0, "calib/step_q_gap": 0.017246818918844664, "calib/step_q_w": 0.4856591639871383, "calib/step_q_w_n": 311.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2016.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 489.90625, "completions/mean_terminated_length": 491.8274841308594, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.0384, "grad_norm": 0.03915192931890488, "kl": 0.039752960205078125, "learning_rate": 4.555555555555556e-06, "loss": 0.0076, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0351969450712204, "mask/share_reasoning": 0.8441604375839233, "mask/share_step_conf": 0.11673638969659805, "num_tokens": 8513633.0, "reward": 0.3843995928764343, "reward_std": 0.17076198756694794, "rewards/accuracy_reward_step": 0.75, "rewards/final_brier_reward_step": 0.7632484436035156, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": -0.34132421016693115, "step": 36 }, { "adv/mean_abs_final_conf": 0.7042989730834961, "adv/mean_abs_reasoning": 0.38596227765083313, "adv/mean_abs_step_conf": 0.7719683647155762, "adv/ratio_final_to_reasoning": 1.8247870682342986, "adv/ratio_step_to_reasoning": 2.000113506983575, "adv/std_final_conf": 0.8864962458610535, "adv/std_reasoning": 0.6613255143165588, "adv/std_step_conf": 0.9358731508255005, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.585840108401084, "calib/avg_num_step_conf": 5.44921875, "calib/ece": 0.47028806584362165, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.9876543209876543, "calib/gap": 0.013930894308943231, "calib/mean_conf": 0.9641152263374487, "calib/mu_c": 0.9711666666666667, "calib/mu_w": 0.9572357723577235, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.47028806584362165, "calib/std_conf": 0.06343671788141357, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5181692573402418, "calib/step_q_c_n": 579.0, "calib/step_q_gap": 0.1321275906735751, "calib/step_q_w": 0.38604166666666667, "calib/step_q_w_n": 816.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2893.0, "completions/max_terminated_length": 2893.0, "completions/mean_length": 582.39453125, "completions/mean_terminated_length": 589.3004150390625, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.039466666666666664, "grad_norm": 0.028868885710835457, "kl": 0.0316619873046875, "learning_rate": 4.527777777777778e-06, "loss": 0.0207, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.030005447566509247, "mask/share_reasoning": 0.8539750576019287, "mask/share_step_conf": 0.10430075228214264, "num_tokens": 8769822.0, "reward": 0.22407503426074982, "reward_std": 0.1822788268327713, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.5044206976890564, "rewards/format_reward_step": 0.9453125, "rewards/step_l1_reward": -0.33908313512802124, "step": 37 }, { "adv/mean_abs_final_conf": 0.7056914567947388, "adv/mean_abs_reasoning": 0.43036288022994995, "adv/mean_abs_step_conf": 0.772477388381958, "adv/ratio_final_to_reasoning": 1.6397591177419304, "adv/ratio_step_to_reasoning": 1.7949442757916543, "adv/std_final_conf": 0.8990572094917297, "adv/std_reasoning": 0.7206587195396423, "adv/std_step_conf": 0.9357432126998901, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5018480628341364, "calib/avg_num_step_conf": 4.80078125, "calib/ece": 0.40133064516129036, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9838709677419355, "calib/gap": 0.008475348161837482, "calib/mean_conf": 0.9618145161290323, "calib/mu_c": 0.9655395683453236, "calib/mu_w": 0.9570642201834861, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.40133064516129036, "calib/std_conf": 0.052505125239149485, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4564380566801619, "calib/step_q_c_n": 741.0, "calib/step_q_gap": -0.006372189221477476, "calib/step_q_w": 0.4628102459016394, "calib/step_q_w_n": 488.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2779.0, "completions/max_terminated_length": 2779.0, "completions/mean_length": 543.45703125, "completions/mean_terminated_length": 543.45703125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.04053333333333333, "grad_norm": 0.026996370404958725, "kl": 0.0350799560546875, "learning_rate": 4.5e-06, "loss": -0.0046, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03184231370687485, "mask/share_reasoning": 0.8610826134681702, "mask/share_step_conf": 0.10707508027553558, "num_tokens": 9015835.0, "reward": 0.26468414068222046, "reward_std": 0.19265443086624146, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5750671625137329, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": -0.3472614288330078, "step": 38 }, { "adv/mean_abs_final_conf": 0.7193535566329956, "adv/mean_abs_reasoning": 0.38643133640289307, "adv/mean_abs_step_conf": 0.7790340185165405, "adv/ratio_final_to_reasoning": 1.861530080166682, "adv/ratio_step_to_reasoning": 2.01597009644249, "adv/std_final_conf": 0.898597776889801, "adv/std_reasoning": 0.6612383723258972, "adv/std_step_conf": 0.9355990886688232, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5419240127269324, "calib/avg_num_step_conf": 4.91015625, "calib/ece": 0.43409448818897645, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9763779527559056, "calib/gap": -0.0058493979661859985, "calib/mean_conf": 0.9629133858267716, "calib/mu_c": 0.9602189781021898, "calib/mu_w": 0.9660683760683758, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.42881889763779535, "calib/std_conf": 0.0665944387290601, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.48212598425196845, "calib/step_q_c_n": 635.0, "calib/step_q_gap": 0.022833379750360816, "calib/step_q_w": 0.45929260450160764, "calib/step_q_w_n": 622.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1228.0, "completions/max_terminated_length": 1228.0, "completions/mean_length": 475.38671875, "completions/mean_terminated_length": 477.2510070800781, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.0416, "grad_norm": 0.024599267169833183, "kl": 0.037715911865234375, "learning_rate": 4.472222222222223e-06, "loss": -0.0049, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03345062956213951, "mask/share_reasoning": 0.8500820994377136, "mask/share_step_conf": 0.11256100237369537, "num_tokens": 9243622.0, "reward": 0.25512662529945374, "reward_std": 0.17816153168678284, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5565023422241211, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": -0.3509365916252136, "step": 39 }, { "adv/mean_abs_final_conf": 0.7258785367012024, "adv/mean_abs_reasoning": 0.440116286277771, "adv/mean_abs_step_conf": 0.7749820947647095, "adv/ratio_final_to_reasoning": 1.6492880616625898, "adv/ratio_step_to_reasoning": 1.7608575708911491, "adv/std_final_conf": 0.9175970554351807, "adv/std_reasoning": 0.7205471396446228, "adv/std_step_conf": 0.9358422756195068, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4996827813729222, "calib/avg_num_step_conf": 4.66796875, "calib/ece": 0.5257308300395259, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9525691699604744, "calib/gap": -0.003204612358837622, "calib/mean_conf": 0.9487355731225297, "calib/mu_c": 0.9469369369369369, "calib/mu_w": 0.9501415492957745, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.5178656126482216, "calib/std_conf": 0.12410322618837606, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4709760956175299, "calib/step_q_c_n": 502.0, "calib/step_q_gap": 0.05189961654105091, "calib/step_q_w": 0.419076479076479, "calib/step_q_w_n": 693.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2952.0, "completions/max_terminated_length": 2952.0, "completions/mean_length": 545.328125, "completions/mean_terminated_length": 545.328125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.042666666666666665, "grad_norm": 0.03541141003370285, "kl": 0.043437957763671875, "learning_rate": 4.444444444444444e-06, "loss": -0.0105, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03277438133955002, "mask/share_reasoning": 0.8654713034629822, "mask/share_step_conf": 0.1017543375492096, "num_tokens": 9489986.0, "reward": 0.19916599988937378, "reward_std": 0.19551372528076172, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.4707808494567871, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": -0.3560425937175751, "step": 40 }, { "adv/mean_abs_final_conf": 0.6797932386398315, "adv/mean_abs_reasoning": 0.3944539725780487, "adv/mean_abs_step_conf": 0.7641567587852478, "adv/ratio_final_to_reasoning": 1.7233778486166067, "adv/ratio_step_to_reasoning": 1.9372520291554367, "adv/std_final_conf": 0.8657556772232056, "adv/std_reasoning": 0.6815750002861023, "adv/std_step_conf": 0.9356968998908997, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6030844155844156, "calib/avg_num_step_conf": 4.796875, "calib/ece": 0.1837007874015749, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9488188976377953, "calib/gap": 0.05154401154401156, "calib/mean_conf": 0.9473228346456694, "calib/mu_c": 0.9586868686868687, "calib/mu_w": 0.9071428571428571, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17574803149606308, "calib/std_conf": 0.127148922660387, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4666802030456853, "calib/step_q_c_n": 985.0, "calib/step_q_gap": 0.0015773223872491693, "calib/step_q_w": 0.46510288065843614, "calib/step_q_w_n": 243.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2506.0, "completions/max_terminated_length": 2506.0, "completions/mean_length": 481.74609375, "completions/mean_terminated_length": 481.74609375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.04373333333333333, "grad_norm": 0.036088407039642334, "kl": 0.041255950927734375, "learning_rate": 4.416666666666667e-06, "loss": -0.0088, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03467945009469986, "mask/share_reasoning": 0.8499183654785156, "mask/share_step_conf": 0.1154022216796875, "num_tokens": 9720561.0, "reward": 0.40249550342559814, "reward_std": 0.17057295143604279, "rewards/accuracy_reward_step": 0.7734375, "rewards/final_brier_reward_step": 0.7933316826820374, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": -0.34068435430526733, "step": 41 }, { "adv/mean_abs_final_conf": 0.7007678747177124, "adv/mean_abs_reasoning": 0.38961368799209595, "adv/mean_abs_step_conf": 0.7727140188217163, "adv/ratio_final_to_reasoning": 1.798622318248554, "adv/ratio_step_to_reasoning": 1.9832825248105561, "adv/std_final_conf": 0.8853124976158142, "adv/std_reasoning": 0.6815693974494934, "adv/std_step_conf": 0.9356843829154968, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5324923547400612, "calib/avg_num_step_conf": 5.12109375, "calib/ece": 0.39150197628458505, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9762845849802372, "calib/gap": 0.020845438328236265, "calib/mean_conf": 0.9606719367588933, "calib/mu_c": 0.9696527777777777, "calib/mu_w": 0.9488073394495414, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39150197628458505, "calib/std_conf": 0.08246177149388098, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4925208913649025, "calib/step_q_c_n": 718.0, "calib/step_q_gap": 0.04903016623842693, "calib/step_q_w": 0.4434907251264756, "calib/step_q_w_n": 593.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2255.0, "completions/max_terminated_length": 2255.0, "completions/mean_length": 444.375, "completions/mean_terminated_length": 444.375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.0448, "grad_norm": 0.024456586688756943, "kl": 0.043132781982421875, "learning_rate": 4.388888888888889e-06, "loss": -0.0717, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03620729595422745, "mask/share_reasoning": 0.8431611061096191, "mask/share_step_conf": 0.12063158303499222, "num_tokens": 9938689.0, "reward": 0.2879253327846527, "reward_std": 0.17871731519699097, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.5978449583053589, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": -0.3321504592895508, "step": 42 }, { "adv/mean_abs_final_conf": 0.7734534740447998, "adv/mean_abs_reasoning": 0.571959376335144, "adv/mean_abs_step_conf": 0.7615091800689697, "adv/ratio_final_to_reasoning": 1.352287428174949, "adv/ratio_step_to_reasoning": 1.3314043122229673, "adv/std_final_conf": 0.9020565152168274, "adv/std_reasoning": 0.7755232453346252, "adv/std_step_conf": 0.9354322552680969, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5376262626262626, "calib/avg_num_step_conf": 4.6171875, "calib/ece": 0.39578740157480324, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9448818897637795, "calib/gap": 0.009183080808080835, "calib/mean_conf": 0.9548425196850394, "calib/mu_c": 0.9588194444444444, "calib/mu_w": 0.9496363636363636, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3918503937007875, "calib/std_conf": 0.09487546127796324, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4688839285714285, "calib/step_q_c_n": 672.0, "calib/step_q_gap": 0.03392314425770304, "calib/step_q_w": 0.4349607843137255, "calib/step_q_w_n": 510.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1266.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 487.3359375, "completions/mean_terminated_length": 489.2471008300781, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.04586666666666667, "grad_norm": 0.03911694511771202, "kl": 0.039653778076171875, "learning_rate": 4.361111111111112e-06, "loss": -0.0361, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.033634115010499954, "mask/share_reasoning": 0.8571068644523621, "mask/share_step_conf": 0.10535275936126709, "num_tokens": 10168671.0, "reward": 0.2883046865463257, "reward_std": 0.23567301034927368, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.5909242033958435, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": -0.324471116065979, "step": 43 }, { "adv/mean_abs_final_conf": 0.7604214549064636, "adv/mean_abs_reasoning": 0.47109296917915344, "adv/mean_abs_step_conf": 0.7571343183517456, "adv/ratio_final_to_reasoning": 1.6141643044077794, "adv/ratio_step_to_reasoning": 1.6071866232073029, "adv/std_final_conf": 0.9101094603538513, "adv/std_reasoning": 0.7392681837081909, "adv/std_step_conf": 0.9356369972229004, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5641794775653044, "calib/avg_num_step_conf": 4.9921875, "calib/ece": 0.4446245059288537, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9090909090909091, "calib/gap": 0.034324459442569455, "calib/mean_conf": 0.9280237154150198, "calib/mu_c": 0.9451181102362204, "calib/mu_w": 0.9107936507936509, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4353359683794466, "calib/std_conf": 0.1671832661987067, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.46154589371980675, "calib/step_q_c_n": 621.0, "calib/step_q_gap": 0.04247435642909142, "calib/step_q_w": 0.41907153729071533, "calib/step_q_w_n": 657.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2043.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 524.55078125, "completions/mean_terminated_length": 526.6078491210938, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.046933333333333334, "grad_norm": 0.02774551697075367, "kl": 0.0364837646484375, "learning_rate": 4.333333333333334e-06, "loss": 0.0266, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.030666790902614594, "mask/share_reasoning": 0.863066554069519, "mask/share_step_conf": 0.10236036777496338, "num_tokens": 10409276.0, "reward": 0.2592760920524597, "reward_std": 0.21509483456611633, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.5511636734008789, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": -0.3294864296913147, "step": 44 }, { "adv/mean_abs_final_conf": 0.7134293913841248, "adv/mean_abs_reasoning": 0.5957998633384705, "adv/mean_abs_step_conf": 0.7652695178985596, "adv/ratio_final_to_reasoning": 1.1974312773194271, "adv/ratio_step_to_reasoning": 1.284440573065077, "adv/std_final_conf": 0.9183647632598877, "adv/std_reasoning": 0.8429709672927856, "adv/std_step_conf": 0.9357597231864929, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5528199932455252, "calib/avg_num_step_conf": 4.99609375, "calib/ece": 0.38191056910569104, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9024390243902439, "calib/gap": 0.036455927051671666, "calib/mean_conf": 0.9339430894308943, "calib/mu_c": 0.9495035460992908, "calib/mu_w": 0.9130476190476191, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.3713414634146341, "calib/std_conf": 0.16610892698159163, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.46075, "calib/step_q_c_n": 720.0, "calib/step_q_gap": 0.04001654740608229, "calib/step_q_w": 0.4207334525939177, "calib/step_q_w_n": 559.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3059.0, "completions/max_terminated_length": 3059.0, "completions/mean_length": 545.0703125, "completions/mean_terminated_length": 545.0703125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.048, "grad_norm": 0.02425733208656311, "kl": 0.03783416748046875, "learning_rate": 4.305555555555556e-06, "loss": 0.1195, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.034552380442619324, "mask/share_reasoning": 0.8488144874572754, "mask/share_step_conf": 0.11663312464952469, "num_tokens": 10653862.0, "reward": 0.28766316175460815, "reward_std": 0.22457343339920044, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.5913242101669312, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": -0.31756043434143066, "step": 45 }, { "adv/mean_abs_final_conf": 0.7256664037704468, "adv/mean_abs_reasoning": 0.42020779848098755, "adv/mean_abs_step_conf": 0.7499903440475464, "adv/ratio_final_to_reasoning": 1.726922742494699, "adv/ratio_step_to_reasoning": 1.7848082466786488, "adv/std_final_conf": 0.91023188829422, "adv/std_reasoning": 0.7013890147209167, "adv/std_step_conf": 0.9354733824729919, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6028079710144928, "calib/avg_num_step_conf": 5.28125, "calib/ece": 0.4146000000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.944, "calib/gap": 0.0023071946169772595, "calib/mean_conf": 0.96172, "calib/mu_c": 0.9627536231884058, "calib/mu_w": 0.9604464285714286, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4121600000000001, "calib/std_conf": 0.04911050396809219, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.47111267605633805, "calib/step_q_c_n": 710.0, "calib/step_q_gap": 0.07262357948312936, "calib/step_q_w": 0.3984890965732087, "calib/step_q_w_n": 642.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3026.0, "completions/max_terminated_length": 3026.0, "completions/mean_length": 529.46484375, "completions/mean_terminated_length": 533.6338500976562, "completions/min_length": 0.0, "completions/min_terminated_length": 159.0, "epoch": 0.04906666666666667, "grad_norm": 0.026741398498415947, "kl": 0.0366668701171875, "learning_rate": 4.277777777777778e-06, "loss": -0.009, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.035755470395088196, "mask/share_reasoning": 0.8419286012649536, "mask/share_step_conf": 0.114503413438797, "num_tokens": 10894173.0, "reward": 0.2640422582626343, "reward_std": 0.17718663811683655, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5698855519294739, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": -0.34492605924606323, "step": 46 }, { "adv/mean_abs_final_conf": 0.705195426940918, "adv/mean_abs_reasoning": 0.385998398065567, "adv/mean_abs_step_conf": 0.7689273953437805, "adv/ratio_final_to_reasoning": 1.8269387398367676, "adv/ratio_step_to_reasoning": 1.9920481514878408, "adv/std_final_conf": 0.8877806663513184, "adv/std_reasoning": 0.6613624691963196, "adv/std_step_conf": 0.9357512593269348, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5420460816777042, "calib/avg_num_step_conf": 4.8046875, "calib/ece": 0.3363967611336034, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9190283400809717, "calib/gap": 0.035028973509933725, "calib/mean_conf": 0.9457894736842106, "calib/mu_c": 0.9594039735099338, "calib/mu_w": 0.9243750000000001, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.335425101214575, "calib/std_conf": 0.11577971986890179, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.44354309165526673, "calib/step_q_c_n": 731.0, "calib/step_q_gap": 0.01596794135466556, "calib/step_q_w": 0.42757515030060117, "calib/step_q_w_n": 499.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2732.0, "completions/max_terminated_length": 2732.0, "completions/mean_length": 533.7734375, "completions/mean_terminated_length": 537.9763793945312, "completions/min_length": 0.0, "completions/min_terminated_length": 184.0, "epoch": 0.050133333333333335, "grad_norm": 0.0264823567122221, "kl": 0.037876129150390625, "learning_rate": 4.25e-06, "loss": -0.0354, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03163876757025719, "mask/share_reasoning": 0.8584343194961548, "mask/share_step_conf": 0.10211436450481415, "num_tokens": 11136795.0, "reward": 0.30292263627052307, "reward_std": 0.16629716753959656, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.626888632774353, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": -0.33119967579841614, "step": 47 }, { "adv/mean_abs_final_conf": 0.7307820320129395, "adv/mean_abs_reasoning": 0.5326458215713501, "adv/mean_abs_step_conf": 0.7509745955467224, "adv/ratio_final_to_reasoning": 1.3719849145855887, "adv/ratio_step_to_reasoning": 1.4098948403111171, "adv/std_final_conf": 0.9083433151245117, "adv/std_reasoning": 0.7754582166671753, "adv/std_step_conf": 0.9357086420059204, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5705682619001857, "calib/avg_num_step_conf": 4.54296875, "calib/ece": 0.43048000000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.904, "calib/gap": 0.020247293228265573, "calib/mean_conf": 0.93896, "calib/mu_c": 0.9487596899224805, "calib/mu_w": 0.9285123966942149, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.42672000000000004, "calib/std_conf": 0.11629496291757437, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.46645833333333336, "calib/step_q_c_n": 576.0, "calib/step_q_gap": 0.027105692788188585, "calib/step_q_w": 0.4393526405451448, "calib/step_q_w_n": 587.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3018.0, "completions/max_terminated_length": 3018.0, "completions/mean_length": 487.84765625, "completions/mean_terminated_length": 489.76080322265625, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.0512, "grad_norm": 0.028510455042123795, "kl": 0.040187835693359375, "learning_rate": 4.222222222222223e-06, "loss": 0.0294, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.0355626717209816, "mask/share_reasoning": 0.8490986824035645, "mask/share_step_conf": 0.11143238097429276, "num_tokens": 11365372.0, "reward": 0.25411996245384216, "reward_std": 0.2041478008031845, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.5546382665634155, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": -0.34249210357666016, "step": 48 }, { "adv/mean_abs_final_conf": 0.7097325921058655, "adv/mean_abs_reasoning": 0.409004807472229, "adv/mean_abs_step_conf": 0.7667136192321777, "adv/ratio_final_to_reasoning": 1.7352671145657759, "adv/ratio_step_to_reasoning": 1.8745833917471417, "adv/std_final_conf": 0.8881635665893555, "adv/std_reasoning": 0.6612932085990906, "adv/std_step_conf": 0.9354108572006226, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6371844660194176, "calib/avg_num_step_conf": 4.1953125, "calib/ece": 0.36671936758893287, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9051383399209486, "calib/gap": 0.026107443365695926, "calib/mean_conf": 0.9568379446640316, "calib/mu_c": 0.9674666666666667, "calib/mu_w": 0.9413592233009708, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.3653359683794467, "calib/std_conf": 0.06414750651290371, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.4735439137134052, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.0058027372428169355, "calib/step_q_w": 0.46774117647058827, "calib/step_q_w_n": 425.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 468.19140625, "completions/mean_terminated_length": 468.19140625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.05226666666666667, "grad_norm": 0.027232561260461807, "kl": 0.04390716552734375, "learning_rate": 4.194444444444445e-06, "loss": -0.0188, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.035721421241760254, "mask/share_reasoning": 0.8589103817939758, "mask/share_step_conf": 0.10536816716194153, "num_tokens": 11589765.0, "reward": 0.3063098192214966, "reward_std": 0.16605576872825623, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6214120984077454, "rewards/format_reward_step": 0.96875, "rewards/step_l1_reward": -0.3197299838066101, "step": 49 }, { "adv/mean_abs_final_conf": 0.7404731512069702, "adv/mean_abs_reasoning": 0.41314762830734253, "adv/mean_abs_step_conf": 0.7684262990951538, "adv/ratio_final_to_reasoning": 1.7922725449028323, "adv/ratio_step_to_reasoning": 1.8599315267604968, "adv/std_final_conf": 0.8928567171096802, "adv/std_reasoning": 0.6613539457321167, "adv/std_step_conf": 0.9354705810546875, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5829973118279569, "calib/avg_num_step_conf": 4.6484375, "calib/ece": 0.3234387351778656, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.924901185770751, "calib/gap": 0.03274932795698937, "calib/mean_conf": 0.953399209486166, "calib/mu_c": 0.9654375000000002, "calib/mu_w": 0.9326881720430108, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3222134387351779, "calib/std_conf": 0.08894366205153649, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.48235543018335686, "calib/step_q_c_n": 709.0, "calib/step_q_gap": 0.049548777376704045, "calib/step_q_w": 0.4328066528066528, "calib/step_q_w_n": 481.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3033.0, "completions/max_terminated_length": 3033.0, "completions/mean_length": 478.578125, "completions/mean_terminated_length": 480.4549255371094, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.05333333333333334, "grad_norm": 0.023459725081920624, "kl": 0.042621612548828125, "learning_rate": 4.166666666666667e-06, "loss": -0.0499, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03598196431994438, "mask/share_reasoning": 0.8490711450576782, "mask/share_step_conf": 0.1110406219959259, "num_tokens": 11817641.0, "reward": 0.32591211795806885, "reward_std": 0.18485143780708313, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.656133234500885, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": -0.3261839747428894, "step": 50 }, { "adv/mean_abs_final_conf": 0.6921615600585938, "adv/mean_abs_reasoning": 0.459354043006897, "adv/mean_abs_step_conf": 0.7920522093772888, "adv/ratio_final_to_reasoning": 1.5068149950912728, "adv/ratio_step_to_reasoning": 1.7242739482438747, "adv/std_final_conf": 0.8642164468765259, "adv/std_reasoning": 0.7392639517784119, "adv/std_step_conf": 0.935680091381073, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6545796400752082, "calib/avg_num_step_conf": 3.96484375, "calib/ece": 0.3464516129032259, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.8790322580645161, "calib/gap": 0.08866236905721214, "calib/mean_conf": 0.9257258064516131, "calib/mu_c": 0.9621917808219179, "calib/mu_w": 0.8735294117647058, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.34173387096774205, "calib/std_conf": 0.18032977581005438, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4720717781402936, "calib/step_q_c_n": 613.0, "calib/step_q_gap": 0.03057924082686081, "calib/step_q_w": 0.4414925373134328, "calib/step_q_w_n": 402.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2298.0, "completions/max_terminated_length": 2298.0, "completions/mean_length": 499.36328125, "completions/mean_terminated_length": 503.2952880859375, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.0544, "grad_norm": 0.02970803529024124, "kl": 0.04041290283203125, "learning_rate": 4.138888888888889e-06, "loss": 0.04, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03323691338300705, "mask/share_reasoning": 0.8658112287521362, "mask/share_step_conf": 0.0931393951177597, "num_tokens": 12054774.0, "reward": 0.3221244513988495, "reward_std": 0.17050030827522278, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6329628825187683, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": -0.2941827178001404, "step": 51 }, { "adv/mean_abs_final_conf": 0.729950487613678, "adv/mean_abs_reasoning": 0.4381468892097473, "adv/mean_abs_step_conf": 0.7624776363372803, "adv/ratio_final_to_reasoning": 1.6659949108168608, "adv/ratio_step_to_reasoning": 1.740232910731157, "adv/std_final_conf": 0.8969921469688416, "adv/std_reasoning": 0.7014232873916626, "adv/std_step_conf": 0.9353620409965515, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6346685082872928, "calib/avg_num_step_conf": 4.125, "calib/ece": 0.22924901185770752, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8695652173913043, "calib/gap": 0.08977747084100685, "calib/mean_conf": 0.9245059288537549, "calib/mu_c": 0.9500552486187845, "calib/mu_w": 0.8602777777777777, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.2191699604743083, "calib/std_conf": 0.17271905082592476, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.46316421895861143, "calib/step_q_c_n": 749.0, "calib/step_q_gap": 0.0243042841051912, "calib/step_q_w": 0.43885993485342023, "calib/step_q_w_n": 307.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1880.0, "completions/max_terminated_length": 1880.0, "completions/mean_length": 460.2890625, "completions/mean_terminated_length": 460.2890625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.055466666666666664, "grad_norm": 0.030791480094194412, "kl": 0.041439056396484375, "learning_rate": 4.111111111111111e-06, "loss": 0.0236, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.036113008856773376, "mask/share_reasoning": 0.861052393913269, "mask/share_step_conf": 0.10283458977937698, "num_tokens": 12280560.0, "reward": 0.3836786150932312, "reward_std": 0.18867164850234985, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.7272488474845886, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": -0.29426658153533936, "step": 52 }, { "adv/mean_abs_final_conf": 0.7180808186531067, "adv/mean_abs_reasoning": 0.3494745194911957, "adv/mean_abs_step_conf": 0.7842679023742676, "adv/ratio_final_to_reasoning": 2.0547444194173856, "adv/ratio_step_to_reasoning": 2.244134718365428, "adv/std_final_conf": 0.8857766389846802, "adv/std_reasoning": 0.6403147578239441, "adv/std_step_conf": 0.9356178045272827, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5780065359477125, "calib/avg_num_step_conf": 4.1953125, "calib/ece": 0.3447619047619049, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8849206349206349, "calib/gap": 0.05336470588235298, "calib/mean_conf": 0.9366666666666668, "calib/mu_c": 0.9582666666666667, "calib/mu_w": 0.9049019607843137, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3430952380952382, "calib/std_conf": 0.1420065949731667, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.45656151419558355, "calib/step_q_c_n": 634.0, "calib/step_q_gap": -0.0026430312589619254, "calib/step_q_w": 0.4592045454545455, "calib/step_q_w_n": 440.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2807.0, "completions/max_terminated_length": 2807.0, "completions/mean_length": 475.59765625, "completions/mean_terminated_length": 477.4627685546875, "completions/min_length": 0.0, "completions/min_terminated_length": 159.0, "epoch": 0.05653333333333333, "grad_norm": 0.03513414040207863, "kl": 0.040142059326171875, "learning_rate": 4.083333333333334e-06, "loss": -0.0345, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03485337644815445, "mask/share_reasoning": 0.8665106296539307, "mask/share_step_conf": 0.09472978115081787, "num_tokens": 12508137.0, "reward": 0.30111369490623474, "reward_std": 0.19021180272102356, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.637919545173645, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": -0.3497546315193176, "step": 53 }, { "adv/mean_abs_final_conf": 0.6736820936203003, "adv/mean_abs_reasoning": 0.2930399179458618, "adv/mean_abs_step_conf": 0.784577488899231, "adv/ratio_final_to_reasoning": 2.298943087148833, "adv/ratio_step_to_reasoning": 2.677374107933579, "adv/std_final_conf": 0.8704360723495483, "adv/std_reasoning": 0.5726863145828247, "adv/std_step_conf": 0.9355136752128601, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6744080145719489, "calib/avg_num_step_conf": 4.109375, "calib/ece": 0.22658823529411773, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.8980392156862745, "calib/gap": 0.11522996357012771, "calib/mean_conf": 0.9253333333333333, "calib/mu_c": 0.9578688524590165, "calib/mu_w": 0.8426388888888888, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21713725490196087, "calib/std_conf": 0.1841136335673148, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5020906432748539, "calib/step_q_c_n": 684.0, "calib/step_q_gap": 0.07888412153572338, "calib/step_q_w": 0.4232065217391305, "calib/step_q_w_n": 368.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1153.0, "completions/max_terminated_length": 1153.0, "completions/mean_length": 403.140625, "completions/mean_terminated_length": 404.7215881347656, "completions/min_length": 0.0, "completions/min_terminated_length": 120.0, "epoch": 0.0576, "grad_norm": 0.040213968604803085, "kl": 0.04549407958984375, "learning_rate": 4.055555555555556e-06, "loss": 0.0123, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04169517755508423, "mask/share_reasoning": 0.8462526202201843, "mask/share_step_conf": 0.10814596712589264, "num_tokens": 12717573.0, "reward": 0.3985499143600464, "reward_std": 0.1534053236246109, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.7640405893325806, "rewards/format_reward_step": 0.99609375, "rewards/step_l1_reward": -0.3091282844543457, "step": 54 }, { "adv/mean_abs_final_conf": 0.6789047718048096, "adv/mean_abs_reasoning": 0.43928492069244385, "adv/mean_abs_step_conf": 0.763441801071167, "adv/ratio_final_to_reasoning": 1.5454770692666926, "adv/ratio_step_to_reasoning": 1.737919434766291, "adv/std_final_conf": 0.8564179539680481, "adv/std_reasoning": 0.7014631628990173, "adv/std_step_conf": 0.9355975985527039, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.662468112244898, "calib/avg_num_step_conf": 3.8046875, "calib/ece": 0.3673412698412697, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.876984126984127, "calib/gap": 0.11323214285714278, "calib/mean_conf": 0.9206746031746031, "calib/mu_c": 0.971, "calib/mu_w": 0.8577678571428572, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3662301587301586, "calib/std_conf": 0.20012481539267846, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5244223107569721, "calib/step_q_c_n": 502.0, "calib/step_q_gap": 0.06404095482476868, "calib/step_q_w": 0.4603813559322034, "calib/step_q_w_n": 472.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2746.0, "completions/max_terminated_length": 2746.0, "completions/mean_length": 443.91015625, "completions/mean_terminated_length": 443.91015625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.058666666666666666, "grad_norm": 0.033687423914670944, "kl": 0.0462799072265625, "learning_rate": 4.027777777777779e-06, "loss": 0.0198, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03844766318798065, "mask/share_reasoning": 0.8608040809631348, "mask/share_step_conf": 0.10074827075004578, "num_tokens": 12939038.0, "reward": 0.3079288899898529, "reward_std": 0.21377216279506683, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6218097805976868, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": -0.311420738697052, "step": 55 }, { "adv/mean_abs_final_conf": 0.6719927787780762, "adv/mean_abs_reasoning": 0.5117623805999756, "adv/mean_abs_step_conf": 0.7626398801803589, "adv/ratio_final_to_reasoning": 1.3130953040945508, "adv/ratio_step_to_reasoning": 1.4902226288815166, "adv/std_final_conf": 0.8674094080924988, "adv/std_reasoning": 0.7752949595451355, "adv/std_step_conf": 0.9356803894042969, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6647956892680736, "calib/avg_num_step_conf": 4.23046875, "calib/ece": 0.42120000000000013, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9, "calib/gap": 0.0558291102700621, "calib/mean_conf": 0.9431200000000001, "calib/mu_c": 0.9696946564885496, "calib/mu_w": 0.9138655462184875, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4201600000000001, "calib/std_conf": 0.14516427108624216, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.48471666666666663, "calib/step_q_c_n": 600.0, "calib/step_q_gap": 0.019312939958592124, "calib/step_q_w": 0.4654037267080745, "calib/step_q_w_n": 483.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2375.0, "completions/max_terminated_length": 2375.0, "completions/mean_length": 444.25, "completions/mean_terminated_length": 447.7480163574219, "completions/min_length": 0.0, "completions/min_terminated_length": 117.0, "epoch": 0.05973333333333333, "grad_norm": 0.040319155901670456, "kl": 0.041660308837890625, "learning_rate": 4.000000000000001e-06, "loss": 0.0712, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03685273975133896, "mask/share_reasoning": 0.8518688678741455, "mask/share_step_conf": 0.10346592217683792, "num_tokens": 13159606.0, "reward": 0.27113640308380127, "reward_std": 0.19077391922473907, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.564152717590332, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": -0.31875497102737427, "step": 56 }, { "adv/mean_abs_final_conf": 0.6866100430488586, "adv/mean_abs_reasoning": 0.5079214572906494, "adv/mean_abs_step_conf": 0.753515362739563, "adv/ratio_final_to_reasoning": 1.351803577488867, "adv/ratio_step_to_reasoning": 1.483527328731018, "adv/std_final_conf": 0.8826867341995239, "adv/std_reasoning": 0.7755393981933594, "adv/std_step_conf": 0.9357887506484985, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6132120796156486, "calib/avg_num_step_conf": 3.859375, "calib/ece": 0.33000000000000024, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.8875502008032129, "calib/gap": 0.04507206588881285, "calib/mean_conf": 0.9297590361445784, "calib/mu_c": 0.9467741935483873, "calib/mu_w": 0.9017021276595745, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.31863453815261067, "calib/std_conf": 0.17179404805891435, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.5024758842443731, "calib/step_q_c_n": 622.0, "calib/step_q_gap": 0.04072725036459163, "calib/step_q_w": 0.46174863387978143, "calib/step_q_w_n": 366.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2568.0, "completions/max_terminated_length": 2568.0, "completions/mean_length": 451.40625, "completions/mean_terminated_length": 453.1764831542969, "completions/min_length": 0.0, "completions/min_terminated_length": 114.0, "epoch": 0.0608, "grad_norm": 0.02778824418783188, "kl": 0.039257049560546875, "learning_rate": 3.972222222222223e-06, "loss": 0.0263, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.0373077429831028, "mask/share_reasoning": 0.8600698709487915, "mask/share_step_conf": 0.09871610999107361, "num_tokens": 13381958.0, "reward": 0.29894113540649414, "reward_std": 0.21296796202659607, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6251699328422546, "rewards/format_reward_step": 0.94140625, "rewards/step_l1_reward": -0.3366626501083374, "step": 57 }, { "adv/mean_abs_final_conf": 0.7430697679519653, "adv/mean_abs_reasoning": 0.5540591478347778, "adv/mean_abs_step_conf": 0.7561875581741333, "adv/ratio_final_to_reasoning": 1.3411379829316545, "adv/ratio_step_to_reasoning": 1.3648137768851183, "adv/std_final_conf": 0.9170951843261719, "adv/std_reasoning": 0.792834460735321, "adv/std_step_conf": 0.9358760714530945, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.49554195222569697, "calib/avg_num_step_conf": 4.5625, "calib/ece": 0.42117886178861785, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.8577235772357723, "calib/gap": 0.0005303080710626906, "calib/mean_conf": 0.9112601626016261, "calib/mu_c": 0.9115037593984964, "calib/mu_w": 0.9109734513274337, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.3958943089430894, "calib/std_conf": 0.21098982125482804, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.4372477064220183, "calib/step_q_c_n": 654.0, "calib/step_q_gap": -0.0034137721772034557, "calib/step_q_w": 0.4406614785992218, "calib/step_q_w_n": 514.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2707.0, "completions/max_terminated_length": 2707.0, "completions/mean_length": 525.37109375, "completions/mean_terminated_length": 525.37109375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.06186666666666667, "grad_norm": 0.03578052669763565, "kl": 0.037464141845703125, "learning_rate": 3.944444444444445e-06, "loss": 0.013, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.0333830825984478, "mask/share_reasoning": 0.8662627935409546, "mask/share_step_conf": 0.10035405308008194, "num_tokens": 13622773.0, "reward": 0.2506124973297119, "reward_std": 0.24518176913261414, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.5477023124694824, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": -0.3417898118495941, "step": 58 }, { "adv/mean_abs_final_conf": 0.6008204221725464, "adv/mean_abs_reasoning": 0.38778069615364075, "adv/mean_abs_step_conf": 0.7656729221343994, "adv/ratio_final_to_reasoning": 1.5493819783502019, "adv/ratio_step_to_reasoning": 1.9744998390302435, "adv/std_final_conf": 0.8101057410240173, "adv/std_reasoning": 0.6613629460334778, "adv/std_step_conf": 0.9356751441955566, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6365789473684211, "calib/avg_num_step_conf": 3.82421875, "calib/ece": 0.32876984126984143, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8571428571428571, "calib/gap": 0.10785789473684226, "calib/mean_conf": 0.910357142857143, "calib/mu_c": 0.9531578947368423, "calib/mu_w": 0.8453, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.31797619047619063, "calib/std_conf": 0.22871160756093376, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.520053003533569, "calib/step_q_c_n": 566.0, "calib/step_q_gap": 0.08189319723816946, "calib/step_q_w": 0.4381598062953995, "calib/step_q_w_n": 413.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2707.0, "completions/max_terminated_length": 2707.0, "completions/mean_length": 473.37890625, "completions/mean_terminated_length": 475.2353210449219, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.06293333333333333, "grad_norm": 0.051249004900455475, "kl": 0.043796539306640625, "learning_rate": 3.916666666666667e-06, "loss": -0.0535, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03806016966700554, "mask/share_reasoning": 0.8660639524459839, "mask/share_step_conf": 0.09196965396404266, "num_tokens": 13850206.0, "reward": 0.3142896294593811, "reward_std": 0.1821354180574417, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6513011455535889, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": -0.3375656306743622, "step": 59 }, { "adv/mean_abs_final_conf": 0.717840313911438, "adv/mean_abs_reasoning": 0.5838816165924072, "adv/mean_abs_step_conf": 0.7559605836868286, "adv/ratio_final_to_reasoning": 1.2294278386444626, "adv/ratio_step_to_reasoning": 1.294715507740579, "adv/std_final_conf": 0.8904332518577576, "adv/std_reasoning": 0.8266067504882812, "adv/std_step_conf": 0.9359144568443298, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6412456008044243, "calib/avg_num_step_conf": 3.63671875, "calib/ece": 0.3835177865612649, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8142292490118577, "calib/gap": 0.10164844142785334, "calib/mean_conf": 0.8696837944664033, "calib/mu_c": 0.9166911764705883, "calib/mu_w": 0.815042735042735, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.35782608695652185, "calib/std_conf": 0.2793214158881204, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5305689277899344, "calib/step_q_c_n": 457.0, "calib/step_q_gap": 0.09343812610217067, "calib/step_q_w": 0.4371308016877637, "calib/step_q_w_n": 474.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2209.0, "completions/max_terminated_length": 2209.0, "completions/mean_length": 444.31640625, "completions/mean_terminated_length": 446.058837890625, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.064, "grad_norm": 0.037136998027563095, "kl": 0.04034423828125, "learning_rate": 3.88888888888889e-06, "loss": -0.017, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03838019073009491, "mask/share_reasoning": 0.8667376041412354, "mask/share_step_conf": 0.09097597002983093, "num_tokens": 14072807.0, "reward": 0.2943606972694397, "reward_std": 0.2698456645011902, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6016941070556641, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": -0.31375402212142944, "step": 60 }, { "adv/mean_abs_final_conf": 0.6191959381103516, "adv/mean_abs_reasoning": 0.4406328797340393, "adv/mean_abs_step_conf": 0.7638034820556641, "adv/ratio_final_to_reasoning": 1.4052422472060886, "adv/ratio_step_to_reasoning": 1.7334237120858675, "adv/std_final_conf": 0.8238195180892944, "adv/std_reasoning": 0.7014070153236389, "adv/std_step_conf": 0.935490071773529, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6044869131699211, "calib/avg_num_step_conf": 3.875, "calib/ece": 0.3202766798418971, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.924901185770751, "calib/gap": 0.023579144162858667, "calib/mean_conf": 0.9609881422924902, "calib/mu_c": 0.9690963855421687, "calib/mu_w": 0.94551724137931, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.31256916996047424, "calib/std_conf": 0.12019504840785379, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5196694214876033, "calib/step_q_c_n": 605.0, "calib/step_q_gap": 0.08757639823178937, "calib/step_q_w": 0.43209302325581395, "calib/step_q_w_n": 387.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2827.0, "completions/max_terminated_length": 2827.0, "completions/mean_length": 390.91015625, "completions/mean_terminated_length": 392.4431457519531, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.06506666666666666, "grad_norm": 0.03942104056477547, "kl": 0.047290802001953125, "learning_rate": 3.861111111111112e-06, "loss": -0.0588, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.043848633766174316, "mask/share_reasoning": 0.8465343713760376, "mask/share_step_conf": 0.10571075230836868, "num_tokens": 14276944.0, "reward": 0.3404572010040283, "reward_std": 0.21853400766849518, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6696093678474426, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": -0.3152574300765991, "step": 61 }, { "adv/mean_abs_final_conf": 0.6601003408432007, "adv/mean_abs_reasoning": 0.473049134016037, "adv/mean_abs_step_conf": 0.7604014277458191, "adv/ratio_final_to_reasoning": 1.3954160220930083, "adv/ratio_step_to_reasoning": 1.6074470347091694, "adv/std_final_conf": 0.8655326962471008, "adv/std_reasoning": 0.7394403219223022, "adv/std_step_conf": 0.9359850287437439, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.504105899356195, "calib/avg_num_step_conf": 3.796875, "calib/ece": 0.46506072874493926, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.8866396761133604, "calib/gap": -0.011488634870581937, "calib/mean_conf": 0.9114574898785426, "calib/mu_c": 0.905968992248062, "calib/mu_w": 0.917457627118644, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.4271255060728745, "calib/std_conf": 0.23152089388703107, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.4835079726651481, "calib/step_q_c_n": 439.0, "calib/step_q_gap": 0.05711022407227756, "calib/step_q_w": 0.42639774859287055, "calib/step_q_w_n": 533.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2742.0, "completions/max_terminated_length": 2742.0, "completions/mean_length": 463.93359375, "completions/mean_terminated_length": 467.58660888671875, "completions/min_length": 0.0, "completions/min_terminated_length": 125.0, "epoch": 0.06613333333333334, "grad_norm": 0.0323968380689621, "kl": 0.046382904052734375, "learning_rate": 3.833333333333334e-06, "loss": -0.1007, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03665943443775177, "mask/share_reasoning": 0.8647003173828125, "mask/share_step_conf": 0.09082774817943573, "num_tokens": 14502791.0, "reward": 0.22184567153453827, "reward_std": 0.23442932963371277, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.5089241862297058, "rewards/format_reward_step": 0.94921875, "rewards/step_l1_reward": -0.35585787892341614, "step": 62 }, { "adv/mean_abs_final_conf": 0.645679235458374, "adv/mean_abs_reasoning": 0.4358217418193817, "adv/mean_abs_step_conf": 0.7689712643623352, "adv/ratio_final_to_reasoning": 1.4815213962546272, "adv/ratio_step_to_reasoning": 1.7644169406330839, "adv/std_final_conf": 0.8309564590454102, "adv/std_reasoning": 0.720628559589386, "adv/std_step_conf": 0.9353896975517273, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6517218543046357, "calib/avg_num_step_conf": 3.75390625, "calib/ece": 0.3347675962815405, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.848605577689243, "calib/gap": 0.09549426048565113, "calib/mean_conf": 0.9161487383798143, "calib/mu_c": 0.954194260485651, "calib/mu_w": 0.8586999999999999, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.32466135458167333, "calib/std_conf": 0.19615684930194882, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5264946619217081, "calib/step_q_c_n": 562.0, "calib/step_q_gap": 0.1052415290896278, "calib/step_q_w": 0.4212531328320803, "calib/step_q_w_n": 399.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2094.0, "completions/max_terminated_length": 2094.0, "completions/mean_length": 482.83203125, "completions/mean_terminated_length": 482.83203125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.0672, "grad_norm": 0.039437055587768555, "kl": 0.040195465087890625, "learning_rate": 3.8055555555555556e-06, "loss": 0.008, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03730270266532898, "mask/share_reasoning": 0.8711056113243103, "mask/share_step_conf": 0.09159170091152191, "num_tokens": 14735036.0, "reward": 0.3450906574726105, "reward_std": 0.19561327993869781, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6513279676437378, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": -0.27364665269851685, "step": 63 }, { "adv/mean_abs_final_conf": 0.6538460850715637, "adv/mean_abs_reasoning": 0.47247079014778137, "adv/mean_abs_step_conf": 0.7776767015457153, "adv/ratio_final_to_reasoning": 1.3838867898416556, "adv/ratio_step_to_reasoning": 1.645978371070242, "adv/std_final_conf": 0.8448007702827454, "adv/std_reasoning": 0.7207430005073547, "adv/std_step_conf": 0.9357219338417053, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5599581339712919, "calib/avg_num_step_conf": 3.90625, "calib/ece": 0.2973809523809523, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8412698412698413, "calib/gap": 0.025242224880382746, "calib/mean_conf": 0.8846031746031746, "calib/mu_c": 0.8922159090909091, "calib/mu_w": 0.8669736842105263, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2417857142857142, "calib/std_conf": 0.26720229181619426, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4425141242937854, "calib/step_q_c_n": 708.0, "calib/step_q_gap": -0.023410533240461218, "calib/step_q_w": 0.4659246575342466, "calib/step_q_w_n": 292.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1505.0, "completions/max_terminated_length": 1505.0, "completions/mean_length": 429.77734375, "completions/mean_terminated_length": 431.4627685546875, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.06826666666666667, "grad_norm": 0.044037114828825, "kl": 0.04555511474609375, "learning_rate": 3.777777777777778e-06, "loss": 0.0171, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03862915188074112, "mask/share_reasoning": 0.8621880412101746, "mask/share_step_conf": 0.09527651220560074, "num_tokens": 14948835.0, "reward": 0.33689361810684204, "reward_std": 0.22495046257972717, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.6830945611000061, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": -0.34368225932121277, "step": 64 }, { "adv/mean_abs_final_conf": 0.5660860538482666, "adv/mean_abs_reasoning": 0.3165215253829956, "adv/mean_abs_step_conf": 0.7652155756950378, "adv/ratio_final_to_reasoning": 1.7884598943572458, "adv/ratio_step_to_reasoning": 2.4175783140470966, "adv/std_final_conf": 0.7545972466468811, "adv/std_reasoning": 0.5961599349975586, "adv/std_step_conf": 0.935614824295044, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6484870269916999, "calib/avg_num_step_conf": 3.40234375, "calib/ece": 0.40028000000000014, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.94, "calib/gap": 0.042424024573557206, "calib/mean_conf": 0.9443600000000001, "calib/mu_c": 0.9625174825174824, "calib/mu_w": 0.9200934579439252, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3863200000000001, "calib/std_conf": 0.18650627442528575, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5389227642276422, "calib/step_q_c_n": 492.0, "calib/step_q_gap": 0.04129743441233874, "calib/step_q_w": 0.49762532981530344, "calib/step_q_w_n": 379.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2430.0, "completions/max_terminated_length": 2430.0, "completions/mean_length": 371.63671875, "completions/mean_terminated_length": 371.63671875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.06933333333333333, "grad_norm": 0.03307357802987099, "kl": 0.0516510009765625, "learning_rate": 3.7500000000000005e-06, "loss": 0.0813, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.044516414403915405, "mask/share_reasoning": 0.8548356294631958, "mask/share_step_conf": 0.1006479412317276, "num_tokens": 15148998.0, "reward": 0.28827229142189026, "reward_std": 0.18606629967689514, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5883980393409729, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": -0.31810346245765686, "step": 65 }, { "adv/mean_abs_final_conf": 0.7129285335540771, "adv/mean_abs_reasoning": 0.49174657464027405, "adv/mean_abs_step_conf": 0.7922347187995911, "adv/ratio_final_to_reasoning": 1.449788509611081, "adv/ratio_step_to_reasoning": 1.6110630142754574, "adv/std_final_conf": 0.8913588523864746, "adv/std_reasoning": 0.7574918270111084, "adv/std_step_conf": 0.9358580112457275, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.627749417029054, "calib/avg_num_step_conf": 3.94140625, "calib/ece": 0.4444444444444445, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8452380952380952, "calib/gap": 0.05686519190773309, "calib/mean_conf": 0.9134920634920636, "calib/mu_c": 0.9426016260162603, "calib/mu_w": 0.8857364341085272, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.43492063492063493, "calib/std_conf": 0.21535064599546164, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5211084337349398, "calib/step_q_c_n": 415.0, "calib/step_q_gap": 0.15723637986288586, "calib/step_q_w": 0.3638720538720539, "calib/step_q_w_n": 594.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2373.0, "completions/max_terminated_length": 2373.0, "completions/mean_length": 471.4609375, "completions/mean_terminated_length": 475.1732177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 103.0, "epoch": 0.0704, "grad_norm": 0.04067553952336311, "kl": 0.049777984619140625, "learning_rate": 3.7222222222222225e-06, "loss": -0.0223, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03776799142360687, "mask/share_reasoning": 0.8647385835647583, "mask/share_step_conf": 0.08968096226453781, "num_tokens": 15376044.0, "reward": 0.25311705470085144, "reward_std": 0.2277534306049347, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.542452335357666, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": -0.3284057378768921, "step": 66 }, { "adv/mean_abs_final_conf": 0.5803280472755432, "adv/mean_abs_reasoning": 0.35292232036590576, "adv/mean_abs_step_conf": 0.7632925510406494, "adv/ratio_final_to_reasoning": 1.6443506510834052, "adv/ratio_step_to_reasoning": 2.1627777757135807, "adv/std_final_conf": 0.7728992700576782, "adv/std_reasoning": 0.6401668190956116, "adv/std_step_conf": 0.9356393814086914, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7155070754716982, "calib/avg_num_step_conf": 3.546875, "calib/ece": 0.3339607843137255, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.8745098039215686, "calib/gap": 0.07929638364779867, "calib/mean_conf": 0.9309019607843138, "calib/mu_c": 0.960754716981132, "calib/mu_w": 0.8814583333333333, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.32066666666666666, "calib/std_conf": 0.1963000159721469, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.49503521126760563, "calib/step_q_c_n": 568.0, "calib/step_q_gap": 0.030358740679370333, "calib/step_q_w": 0.4646764705882353, "calib/step_q_w_n": 340.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2748.0, "completions/max_terminated_length": 2748.0, "completions/mean_length": 438.59375, "completions/mean_terminated_length": 438.59375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.07146666666666666, "grad_norm": 0.0408489964902401, "kl": 0.0499267578125, "learning_rate": 3.694444444444445e-06, "loss": 0.0181, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.039613641798496246, "mask/share_reasoning": 0.8700891733169556, "mask/share_step_conf": 0.09029721468687057, "num_tokens": 15593332.0, "reward": 0.34359416365623474, "reward_std": 0.1910526156425476, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.6591253876686096, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": -0.2938120365142822, "step": 67 }, { "adv/mean_abs_final_conf": 0.60542231798172, "adv/mean_abs_reasoning": 0.42741620540618896, "adv/mean_abs_step_conf": 0.7589120864868164, "adv/ratio_final_to_reasoning": 1.4164702000626426, "adv/ratio_step_to_reasoning": 1.7755809837991403, "adv/std_final_conf": 0.8146088719367981, "adv/std_reasoning": 0.7204663753509521, "adv/std_step_conf": 0.9357614517211914, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7003043253043253, "calib/avg_num_step_conf": 3.64453125, "calib/ece": 0.3646613545816734, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9043824701195219, "calib/gap": 0.1351100751100751, "calib/mean_conf": 0.934382470119522, "calib/mu_c": 0.9925174825174826, "calib/mu_w": 0.8574074074074075, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3646613545816734, "calib/std_conf": 0.19353885392746306, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5307739307535642, "calib/step_q_c_n": 491.0, "calib/step_q_gap": 0.13072868188478587, "calib/step_q_w": 0.4000452488687783, "calib/step_q_w_n": 442.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2438.0, "completions/max_terminated_length": 2438.0, "completions/mean_length": 438.5625, "completions/mean_terminated_length": 438.5625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.07253333333333334, "grad_norm": 0.041994914412498474, "kl": 0.048595428466796875, "learning_rate": 3.6666666666666666e-06, "loss": 0.0268, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.04383733496069908, "mask/share_reasoning": 0.8566344380378723, "mask/share_step_conf": 0.0995282307267189, "num_tokens": 15809692.0, "reward": 0.3205069899559021, "reward_std": 0.2103608250617981, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6379590034484863, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": -0.3047575354576111, "step": 68 }, { "adv/mean_abs_final_conf": 0.6797294616699219, "adv/mean_abs_reasoning": 0.49723154306411743, "adv/mean_abs_step_conf": 0.7593463659286499, "adv/ratio_final_to_reasoning": 1.3670280398568189, "adv/ratio_step_to_reasoning": 1.5271484211345239, "adv/std_final_conf": 0.862686276435852, "adv/std_reasoning": 0.7394139170646667, "adv/std_step_conf": 0.9360142350196838, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6129368126504848, "calib/avg_num_step_conf": 3.73046875, "calib/ece": 0.4212096774193548, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8024193548387096, "calib/gap": 0.06364807704821995, "calib/mean_conf": 0.8887096774193549, "calib/mu_c": 0.919763779527559, "calib/mu_w": 0.856115702479339, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3989112903225806, "calib/std_conf": 0.24823885396131076, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5252898550724638, "calib/step_q_c_n": 414.0, "calib/step_q_gap": 0.1472491896380831, "calib/step_q_w": 0.37804066543438075, "calib/step_q_w_n": 541.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3038.0, "completions/max_terminated_length": 3038.0, "completions/mean_length": 520.7265625, "completions/mean_terminated_length": 520.7265625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.0736, "grad_norm": 0.057562634348869324, "kl": 0.039585113525390625, "learning_rate": 3.638888888888889e-06, "loss": -0.0314, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03592987731099129, "mask/share_reasoning": 0.8848861455917358, "mask/share_step_conf": 0.07918399572372437, "num_tokens": 16047494.0, "reward": 0.2596679627895355, "reward_std": 0.2375001162290573, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.5565113425254822, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": -0.3293628692626953, "step": 69 }, { "adv/mean_abs_final_conf": 0.685692548751831, "adv/mean_abs_reasoning": 0.4477848410606384, "adv/mean_abs_step_conf": 0.7801724672317505, "adv/ratio_final_to_reasoning": 1.531299155030966, "adv/ratio_step_to_reasoning": 1.7422931633500756, "adv/std_final_conf": 0.8694822788238525, "adv/std_reasoning": 0.7392827272415161, "adv/std_step_conf": 0.9357007145881653, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7277905785970301, "calib/avg_num_step_conf": 3.78125, "calib/ece": 0.34496000000000004, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.756, "calib/gap": 0.2317869943676395, "calib/mean_conf": 0.84424, "calib/mu_c": 0.9592063492063491, "calib/mu_w": 0.7274193548387096, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3426, "calib/std_conf": 0.30037114109048496, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4912448132780083, "calib/step_q_c_n": 482.0, "calib/step_q_gap": 0.09167691204344036, "calib/step_q_w": 0.3995679012345679, "calib/step_q_w_n": 486.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2957.0, "completions/max_terminated_length": 2957.0, "completions/mean_length": 482.65625, "completions/mean_terminated_length": 484.5490417480469, "completions/min_length": 0.0, "completions/min_terminated_length": 107.0, "epoch": 0.07466666666666667, "grad_norm": 0.05122093856334686, "kl": 0.045070648193359375, "learning_rate": 3.6111111111111115e-06, "loss": 0.0169, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03974894434213638, "mask/share_reasoning": 0.8590233325958252, "mask/share_step_conf": 0.09732148051261902, "num_tokens": 16278046.0, "reward": 0.3146213889122009, "reward_std": 0.2293742597103119, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.6408277153968811, "rewards/format_reward_step": 0.96875, "rewards/step_l1_reward": -0.3037724494934082, "step": 70 }, { "adv/mean_abs_final_conf": 0.698284924030304, "adv/mean_abs_reasoning": 0.48682349920272827, "adv/mean_abs_step_conf": 0.7600011825561523, "adv/ratio_final_to_reasoning": 1.4343697976245733, "adv/ratio_step_to_reasoning": 1.5611431736569983, "adv/std_final_conf": 0.8622894287109375, "adv/std_reasoning": 0.7394139170646667, "adv/std_step_conf": 0.9357407093048096, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.636729396495782, "calib/avg_num_step_conf": 4.40625, "calib/ece": 0.3746987951807229, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6987951807228916, "calib/gap": 0.09127190136275143, "calib/mean_conf": 0.8117269076305221, "calib/mu_c": 0.8538805970149254, "calib/mu_w": 0.7626086956521739, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.32413654618473897, "calib/std_conf": 0.31846618748173583, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.40028148148148146, "calib/step_q_c_n": 621.0, "calib/step_q_gap": 0.020833749726057338, "calib/step_q_w": 0.3794477317554241, "calib/step_q_w_n": 507.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2925.0, "completions/max_terminated_length": 2925.0, "completions/mean_length": 481.77734375, "completions/mean_terminated_length": 483.66668701171875, "completions/min_length": 0.0, "completions/min_terminated_length": 121.0, "epoch": 0.07573333333333333, "grad_norm": 0.043013013899326324, "kl": 0.05849456787109375, "learning_rate": 3.5833333333333335e-06, "loss": 0.0153, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03930443152785301, "mask/share_reasoning": 0.8558655977249146, "mask/share_step_conf": 0.10092371702194214, "num_tokens": 16505789.0, "reward": 0.2700149714946747, "reward_std": 0.21516259014606476, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6017140746116638, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": -0.3593403697013855, "step": 71 }, { "adv/mean_abs_final_conf": 0.713843822479248, "adv/mean_abs_reasoning": 0.4635249078273773, "adv/mean_abs_step_conf": 0.776650071144104, "adv/ratio_final_to_reasoning": 1.5400333626625577, "adv/ratio_step_to_reasoning": 1.6755303933598722, "adv/std_final_conf": 0.8841903805732727, "adv/std_reasoning": 0.7392738461494446, "adv/std_step_conf": 0.9350262880325317, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7127551020408163, "calib/avg_num_step_conf": 3.61328125, "calib/ece": 0.28150277777777777, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/gap": 0.23904196428571434, "calib/mean_conf": 0.7761162698412698, "calib/mu_c": 0.8823571428571428, "calib/mu_w": 0.6433151785714285, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.251031746031746, "calib/std_conf": 0.3459573712344958, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5013888888888889, "calib/step_q_c_n": 504.0, "calib/step_q_gap": 0.09784969648983904, "calib/step_q_w": 0.40353919239904984, "calib/step_q_w_n": 421.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1327.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 411.609375, "completions/mean_terminated_length": 411.609375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.0768, "grad_norm": 2.196476697921753, "kl": 13.056953430175781, "learning_rate": 3.555555555555556e-06, "loss": 0.1237, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03887656331062317, "mask/share_reasoning": 0.8669606447219849, "mask/share_step_conf": 0.09416281431913376, "num_tokens": 16715569.0, "reward": 0.35540151596069336, "reward_std": 0.21558502316474915, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6877049803733826, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": -0.28158947825431824, "step": 72 }, { "adv/mean_abs_final_conf": 0.749143123626709, "adv/mean_abs_reasoning": 0.5188073515892029, "adv/mean_abs_step_conf": 0.7993726134300232, "adv/ratio_final_to_reasoning": 1.4439716810718757, "adv/ratio_step_to_reasoning": 1.5407889093733869, "adv/std_final_conf": 0.9182217717170715, "adv/std_reasoning": 0.7577370405197144, "adv/std_step_conf": 0.9359779357910156, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7088681849551414, "calib/avg_num_step_conf": 3.578125, "calib/ece": 0.2539442231075698, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6454183266932271, "calib/gap": 0.20489993098688764, "calib/mean_conf": 0.7803187250996017, "calib/mu_c": 0.8537888198757765, "calib/mu_w": 0.6488888888888888, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19641434262948212, "calib/std_conf": 0.3351979524678122, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5200347826086956, "calib/step_q_c_n": 575.0, "calib/step_q_gap": 0.06804064771133495, "calib/step_q_w": 0.4519941348973607, "calib/step_q_w_n": 341.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2977.0, "completions/max_terminated_length": 2977.0, "completions/mean_length": 438.71875, "completions/mean_terminated_length": 438.71875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.07786666666666667, "grad_norm": 0.06421296298503876, "kl": 0.0592193603515625, "learning_rate": 3.5277777777777784e-06, "loss": 0.0348, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03809599205851555, "mask/share_reasoning": 0.870836615562439, "mask/share_step_conf": 0.09106739610433578, "num_tokens": 16934913.0, "reward": 0.36335694789886475, "reward_std": 0.23348718881607056, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7183008193969727, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": -0.3134618401527405, "step": 73 }, { "adv/mean_abs_final_conf": 0.6810969114303589, "adv/mean_abs_reasoning": 0.49199649691581726, "adv/mean_abs_step_conf": 0.743833601474762, "adv/ratio_final_to_reasoning": 1.3843531726342708, "adv/ratio_step_to_reasoning": 1.5118676782002274, "adv/std_final_conf": 0.8765449523925781, "adv/std_reasoning": 0.7575379014015198, "adv/std_step_conf": 0.9211689233779907, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6781118786401191, "calib/avg_num_step_conf": 3.25390625, "calib/ece": 0.2613877551020408, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.5387755102040817, "calib/gap": 0.23317215224163623, "calib/mean_conf": 0.6765714285714286, "calib/mu_c": 0.7784057971014493, "calib/mu_w": 0.5452336448598131, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.18734693877551023, "calib/std_conf": 0.3893307706559961, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.5076681614349775, "calib/step_q_c_n": 446.0, "calib/step_q_gap": 0.11278444050474495, "calib/step_q_w": 0.39488372093023255, "calib/step_q_w_n": 387.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1365.0, "completions/max_terminated_length": 1365.0, "completions/mean_length": 415.296875, "completions/mean_terminated_length": 418.5669250488281, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.07893333333333333, "grad_norm": 0.034008484333753586, "kl": 0.0598907470703125, "learning_rate": 3.5e-06, "loss": -0.0063, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.04049193859100342, "mask/share_reasoning": 0.8622183203697205, "mask/share_step_conf": 0.08947721868753433, "num_tokens": 17145157.0, "reward": 0.33305859565734863, "reward_std": 0.20567968487739563, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6704207062721252, "rewards/format_reward_step": 0.94921875, "rewards/step_l1_reward": -0.301959753036499, "step": 74 }, { "adv/mean_abs_final_conf": 0.6795423030853271, "adv/mean_abs_reasoning": 0.4613600969314575, "adv/mean_abs_step_conf": 0.7674668431282043, "adv/ratio_final_to_reasoning": 1.4729108728843625, "adv/ratio_step_to_reasoning": 1.6634876926563156, "adv/std_final_conf": 0.8948934674263, "adv/std_reasoning": 0.7393792271614075, "adv/std_step_conf": 0.9358657002449036, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6866719872306465, "calib/avg_num_step_conf": 3.3671875, "calib/ece": 0.19417630522088364, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/gap": 0.2722870710295291, "calib/mean_conf": 0.7721690763052209, "calib/mu_c": 0.8487156424581005, "calib/mu_w": 0.5764285714285714, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.12373493975903624, "calib/std_conf": 0.35158548802404604, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.558740635451505, "calib/step_q_c_n": 598.0, "calib/step_q_gap": 0.0989300293908989, "calib/step_q_w": 0.4598106060606061, "calib/step_q_w_n": 264.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1121.0, "completions/max_terminated_length": 1121.0, "completions/mean_length": 376.37109375, "completions/mean_terminated_length": 377.8470764160156, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.08, "grad_norm": 0.06710561364889145, "kl": 0.0687713623046875, "learning_rate": 3.4722222222222224e-06, "loss": -0.0058, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.042312610894441605, "mask/share_reasoning": 0.8570511341094971, "mask/share_step_conf": 0.09672995656728745, "num_tokens": 17346260.0, "reward": 0.41363397240638733, "reward_std": 0.2197515368461609, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.756236732006073, "rewards/format_reward_step": 0.96875, "rewards/step_l1_reward": -0.26256251335144043, "step": 75 }, { "adv/mean_abs_final_conf": 0.7158702611923218, "adv/mean_abs_reasoning": 0.4958776831626892, "adv/mean_abs_step_conf": 0.772760808467865, "adv/ratio_final_to_reasoning": 1.4436428286639724, "adv/ratio_step_to_reasoning": 1.5583698051084405, "adv/std_final_conf": 0.8861145377159119, "adv/std_reasoning": 0.7576341032981873, "adv/std_step_conf": 0.9360941648483276, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.648254985754986, "calib/avg_num_step_conf": 3.0078125, "calib/ece": 0.2856097560975609, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6300813008130082, "calib/gap": 0.1681111111111111, "calib/mean_conf": 0.746829268292683, "calib/mu_c": 0.8083333333333333, "calib/mu_w": 0.6402222222222222, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.19914634146341456, "calib/std_conf": 0.36616961966127015, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5423432343234323, "calib/step_q_c_n": 505.0, "calib/step_q_gap": 0.08015191356871532, "calib/step_q_w": 0.462191320754717, "calib/step_q_w_n": 265.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2095.0, "completions/max_terminated_length": 2095.0, "completions/mean_length": 402.12890625, "completions/mean_terminated_length": 403.7059020996094, "completions/min_length": 0.0, "completions/min_terminated_length": 92.0, "epoch": 0.08106666666666666, "grad_norm": 0.03836711868643761, "kl": 0.0735931396484375, "learning_rate": 3.444444444444445e-06, "loss": -0.0136, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.04165346175432205, "mask/share_reasoning": 0.8721697330474854, "mask/share_step_conf": 0.08227050304412842, "num_tokens": 17552261.0, "reward": 0.33543461561203003, "reward_std": 0.2189868986606598, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6719093918800354, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": -0.3151026964187622, "step": 76 }, { "adv/mean_abs_final_conf": 0.7912461757659912, "adv/mean_abs_reasoning": 0.5600335597991943, "adv/mean_abs_step_conf": 0.763465404510498, "adv/ratio_final_to_reasoning": 1.4128549297111774, "adv/ratio_step_to_reasoning": 1.3632493823838812, "adv/std_final_conf": 0.9182538390159607, "adv/std_reasoning": 0.7931925058364868, "adv/std_step_conf": 0.9360113143920898, "calib/answer_extract_rate": 0.8828125, "calib/auroc": 0.6853621730382293, "calib/avg_num_step_conf": 3.203125, "calib/ece": 0.21491150442477885, "calib/final_conf_rate": 0.8828125, "calib/format_rate": 0.8828125, "calib/frac_conf_gt_0.9": 0.588495575221239, "calib/gap": 0.30158786049631114, "calib/mean_conf": 0.7150884955752213, "calib/mu_c": 0.8271830985915493, "calib/mu_w": 0.5255952380952381, "calib/nonempty_final_conf_rate": 0.8828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15084070796460186, "calib/std_conf": 0.3765989406252195, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.508991935483871, "calib/step_q_c_n": 496.0, "calib/step_q_gap": 0.06880675029868583, "calib/step_q_w": 0.4401851851851852, "calib/step_q_w_n": 324.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2587.0, "completions/max_terminated_length": 2587.0, "completions/mean_length": 419.59765625, "completions/mean_terminated_length": 421.2431640625, "completions/min_length": 0.0, "completions/min_terminated_length": 106.0, "epoch": 0.08213333333333334, "grad_norm": 0.04939044639468193, "kl": 0.07457733154296875, "learning_rate": 3.416666666666667e-06, "loss": -0.0774, "mask/has_final_conf_rate": 0.8828125, "mask/share_final_conf": 0.04175189882516861, "mask/share_reasoning": 0.8607625961303711, "mask/share_step_conf": 0.0935792326927185, "num_tokens": 17764342.0, "reward": 0.3393831253051758, "reward_std": 0.24981173872947693, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.669147253036499, "rewards/format_reward_step": 0.8828125, "rewards/step_l1_reward": -0.277880996465683, "step": 77 }, { "adv/mean_abs_final_conf": 0.7613974809646606, "adv/mean_abs_reasoning": 0.5785947442054749, "adv/mean_abs_step_conf": 0.7588104009628296, "adv/ratio_final_to_reasoning": 1.31594261543148, "adv/ratio_step_to_reasoning": 1.3114712993199178, "adv/std_final_conf": 0.9037981033325195, "adv/std_reasoning": 0.8100994229316711, "adv/std_step_conf": 0.9359738826751709, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.6475569544364509, "calib/avg_num_step_conf": 3.3125, "calib/ece": 0.27897787234042565, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.8984375, "calib/frac_conf_gt_0.9": 0.6212765957446809, "calib/gap": 0.17865948741007198, "calib/mean_conf": 0.7504689361702128, "calib/mu_c": 0.823453237410072, "calib/mu_w": 0.64479375, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.21897872340425545, "calib/std_conf": 0.34436838490837135, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.535946127946128, "calib/step_q_c_n": 495.0, "calib/step_q_gap": 0.10817275684131211, "calib/step_q_w": 0.42777337110481584, "calib/step_q_w_n": 353.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1923.0, "completions/max_terminated_length": 1923.0, "completions/mean_length": 443.90234375, "completions/mean_terminated_length": 445.6431579589844, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.0832, "grad_norm": 0.05135057866573334, "kl": 0.0748138427734375, "learning_rate": 3.3888888888888893e-06, "loss": 0.0086, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.035105351358652115, "mask/share_reasoning": 0.8789252042770386, "mask/share_step_conf": 0.08206315338611603, "num_tokens": 17986005.0, "reward": 0.3178112506866455, "reward_std": 0.2435327172279358, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6297679543495178, "rewards/format_reward_step": 0.8984375, "rewards/step_l1_reward": -0.2832079827785492, "step": 78 }, { "adv/mean_abs_final_conf": 0.8010854125022888, "adv/mean_abs_reasoning": 0.695340633392334, "adv/mean_abs_step_conf": 0.7600843906402588, "adv/ratio_final_to_reasoning": 1.1520762257112194, "adv/ratio_step_to_reasoning": 1.0931108497601552, "adv/std_final_conf": 0.9360950589179993, "adv/std_reasoning": 0.8907221555709839, "adv/std_step_conf": 0.9353445172309875, "calib/answer_extract_rate": 0.8828125, "calib/auroc": 0.6016976953989994, "calib/avg_num_step_conf": 3.15234375, "calib/ece": 0.28291371681415933, "calib/final_conf_rate": 0.8828125, "calib/format_rate": 0.8671875, "calib/frac_conf_gt_0.9": 0.5752212389380531, "calib/gap": 0.15731760026244557, "calib/mean_conf": 0.7022190265486726, "calib/mu_c": 0.7641715328467152, "calib/mu_w": 0.6068539325842697, "calib/nonempty_final_conf_rate": 0.8828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1894690265486726, "calib/std_conf": 0.38386494795215287, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5125550660792951, "calib/step_q_c_n": 454.0, "calib/step_q_gap": 0.026794914993365004, "calib/step_q_w": 0.4857601510859301, "calib/step_q_w_n": 353.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 431.9375, "completions/mean_terminated_length": 433.63140869140625, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.08426666666666667, "grad_norm": 0.051673173904418945, "kl": 0.08280181884765625, "learning_rate": 3.3611111111111117e-06, "loss": -0.1776, "mask/has_final_conf_rate": 0.87890625, "mask/share_final_conf": 0.03425498306751251, "mask/share_reasoning": 0.8841665387153625, "mask/share_step_conf": 0.07767222076654434, "num_tokens": 18202957.0, "reward": 0.29108428955078125, "reward_std": 0.263746976852417, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5915170907974243, "rewards/format_reward_step": 0.8671875, "rewards/step_l1_reward": -0.289817214012146, "step": 79 }, { "adv/mean_abs_final_conf": 0.746005654335022, "adv/mean_abs_reasoning": 0.6409313678741455, "adv/mean_abs_step_conf": 0.7787002921104431, "adv/ratio_final_to_reasoning": 1.1639399968976227, "adv/ratio_step_to_reasoning": 1.214951133837079, "adv/std_final_conf": 0.90828537940979, "adv/std_reasoning": 0.8435416221618652, "adv/std_step_conf": 0.9360520839691162, "calib/answer_extract_rate": 0.80078125, "calib/auroc": 0.7184604770352848, "calib/avg_num_step_conf": 3.47265625, "calib/ece": 0.3028407224958948, "calib/final_conf_rate": 0.79296875, "calib/format_rate": 0.78125, "calib/frac_conf_gt_0.9": 0.7389162561576355, "calib/gap": 0.2102201195873582, "calib/mean_conf": 0.8577175697865353, "calib/mu_c": 0.9498830409356728, "calib/mu_w": 0.7396629213483146, "calib/nonempty_final_conf_rate": 0.79296875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.29949096880131354, "calib/std_conf": 0.264480346069493, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5575022988505747, "calib/step_q_c_n": 435.0, "calib/step_q_gap": 0.09140097726467161, "calib/step_q_w": 0.46610132158590306, "calib/step_q_w_n": 454.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2102.0, "completions/max_terminated_length": 2102.0, "completions/mean_length": 392.23046875, "completions/mean_terminated_length": 392.23046875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.08533333333333333, "grad_norm": 0.07009822130203247, "kl": 0.0958251953125, "learning_rate": 3.3333333333333333e-06, "loss": -0.1183, "mask/has_final_conf_rate": 0.79296875, "mask/share_final_conf": 0.03364042192697525, "mask/share_reasoning": 0.8711358904838562, "mask/share_step_conf": 0.09522369503974915, "num_tokens": 18405528.0, "reward": 0.2661329507827759, "reward_std": 0.23599350452423096, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.5544984340667725, "rewards/format_reward_step": 0.78125, "rewards/step_l1_reward": -0.26910752058029175, "step": 80 }, { "adv/mean_abs_final_conf": 0.7740898728370667, "adv/mean_abs_reasoning": 0.7841185331344604, "adv/mean_abs_step_conf": 0.7297874093055725, "adv/ratio_final_to_reasoning": 0.9872102751387537, "adv/ratio_step_to_reasoning": 0.9307105730409113, "adv/std_final_conf": 0.9220060706138611, "adv/std_reasoning": 0.921212375164032, "adv/std_step_conf": 0.9199628829956055, "calib/answer_extract_rate": 0.55859375, "calib/auroc": 0.6680855481727574, "calib/avg_num_step_conf": 2.89453125, "calib/ece": 0.27380281690140845, "calib/final_conf_rate": 0.5546875, "calib/format_rate": 0.55078125, "calib/frac_conf_gt_0.9": 0.5915492957746479, "calib/gap": 0.17347591362126247, "calib/mean_conf": 0.7830985915492958, "calib/mu_c": 0.8515116279069768, "calib/mu_w": 0.6780357142857143, "calib/nonempty_final_conf_rate": 0.5546875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.22563380281690143, "calib/std_conf": 0.3208750867013859, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5628947368421053, "calib/step_q_c_n": 266.0, "calib/step_q_gap": 0.10815431578947377, "calib/step_q_w": 0.45474042105263157, "calib/step_q_w_n": 475.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2907.0, "completions/max_terminated_length": 2907.0, "completions/mean_length": 426.8046875, "completions/mean_terminated_length": 428.47845458984375, "completions/min_length": 0.0, "completions/min_terminated_length": 116.0, "epoch": 0.0864, "grad_norm": 0.08157815784215927, "kl": 0.08916473388671875, "learning_rate": 3.3055555555555558e-06, "loss": -0.3737, "mask/has_final_conf_rate": 0.5546875, "mask/share_final_conf": 0.024003587663173676, "mask/share_reasoning": 0.8975480794906616, "mask/share_step_conf": 0.07454213500022888, "num_tokens": 18621038.0, "reward": 0.19895264506340027, "reward_std": 0.2581195831298828, "rewards/accuracy_reward_step": 0.33984375, "rewards/final_brier_reward_step": 0.39003515243530273, "rewards/format_reward_step": 0.55078125, "rewards/step_l1_reward": -0.17025485634803772, "step": 81 }, { "adv/mean_abs_final_conf": 0.7679874897003174, "adv/mean_abs_reasoning": 0.7872791886329651, "adv/mean_abs_step_conf": 0.7426701784133911, "adv/ratio_final_to_reasoning": 0.9754957336467309, "adv/ratio_step_to_reasoning": 0.9433377499829085, "adv/std_final_conf": 0.9068247675895691, "adv/std_reasoning": 0.9212319254875183, "adv/std_step_conf": 0.9205906987190247, "calib/answer_extract_rate": 0.46875, "calib/auroc": 0.7012411347517731, "calib/avg_num_step_conf": 2.69921875, "calib/ece": 0.29076722689075624, "calib/final_conf_rate": 0.46484375, "calib/format_rate": 0.45703125, "calib/frac_conf_gt_0.9": 0.680672268907563, "calib/gap": 0.2214624408983451, "calib/mean_conf": 0.7929302521008401, "calib/mu_c": 0.8803986111111111, "calib/mu_w": 0.658936170212766, "calib/nonempty_final_conf_rate": 0.46484375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.23932773109243693, "calib/std_conf": 0.34122963399802164, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5550433884297522, "calib/step_q_c_n": 242.0, "calib/step_q_gap": 0.029007753685876936, "calib/step_q_w": 0.5260356347438753, "calib/step_q_w_n": 449.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2523.0, "completions/max_terminated_length": 2523.0, "completions/mean_length": 367.9296875, "completions/mean_terminated_length": 367.9296875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.08746666666666666, "grad_norm": 0.0639415830373764, "kl": 0.1104278564453125, "learning_rate": 3.277777777777778e-06, "loss": -0.5865, "mask/has_final_conf_rate": 0.46484375, "mask/share_final_conf": 0.021395236253738403, "mask/share_reasoning": 0.9011713266372681, "mask/share_step_conf": 0.07743342220783234, "num_tokens": 18820780.0, "reward": 0.16285640001296997, "reward_std": 0.24152183532714844, "rewards/accuracy_reward_step": 0.28125, "rewards/final_brier_reward_step": 0.32856854796409607, "rewards/format_reward_step": 0.45703125, "rewards/step_l1_reward": -0.1505119949579239, "step": 82 }, { "adv/mean_abs_final_conf": 0.5532522797584534, "adv/mean_abs_reasoning": 0.6258186101913452, "adv/mean_abs_step_conf": 0.6101732850074768, "adv/ratio_final_to_reasoning": 0.8840457454425898, "adv/ratio_step_to_reasoning": 0.9750002238203098, "adv/std_final_conf": 0.7766796350479126, "adv/std_reasoning": 0.8271476626396179, "adv/std_step_conf": 0.8270086646080017, "calib/answer_extract_rate": 0.32421875, "calib/auroc": 0.7380116959064328, "calib/avg_num_step_conf": 2.52734375, "calib/ece": 0.27225301204819274, "calib/final_conf_rate": 0.32421875, "calib/format_rate": 0.3203125, "calib/frac_conf_gt_0.9": 0.6024096385542169, "calib/gap": 0.28448245614035106, "calib/mean_conf": 0.7564216867469878, "calib/mu_c": 0.8866666666666666, "calib/mu_w": 0.6021842105263155, "calib/nonempty_final_conf_rate": 0.32421875, "calib/nonempty_reasoning_rate": 0.9609375, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.24325301204819277, "calib/std_conf": 0.35959224298701437, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.5001329729729729, "calib/step_q_c_n": 185.0, "calib/step_q_gap": -0.03067763308763316, "calib/step_q_w": 0.530810606060606, "calib/step_q_w_n": 462.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1511.0, "completions/max_terminated_length": 1511.0, "completions/mean_length": 405.5234375, "completions/mean_terminated_length": 407.1137390136719, "completions/min_length": 0.0, "completions/min_terminated_length": 73.0, "epoch": 0.08853333333333334, "grad_norm": 0.06789255887269974, "kl": 0.1085205078125, "learning_rate": 3.2500000000000002e-06, "loss": -0.5658, "mask/has_final_conf_rate": 0.32421875, "mask/share_final_conf": 0.014378046616911888, "mask/share_reasoning": 0.9094371795654297, "mask/share_step_conf": 0.07227854430675507, "num_tokens": 19031858.0, "reward": 0.12394683808088303, "reward_std": 0.16958144307136536, "rewards/accuracy_reward_step": 0.17578125, "rewards/final_brier_reward_step": 0.2288168966770172, "rewards/format_reward_step": 0.3203125, "rewards/step_l1_reward": -0.08014197647571564, "step": 83 }, { "adv/mean_abs_final_conf": 0.40271973609924316, "adv/mean_abs_reasoning": 0.4289896786212921, "adv/mean_abs_step_conf": 0.4000370502471924, "adv/ratio_final_to_reasoning": 0.9387632294406789, "adv/ratio_step_to_reasoning": 0.93250973201232, "adv/std_final_conf": 0.7011737823486328, "adv/std_reasoning": 0.7209097743034363, "adv/std_step_conf": 0.7012189626693726, "calib/answer_extract_rate": 0.13671875, "calib/auroc": 0.8480392156862745, "calib/avg_num_step_conf": 1.84765625, "calib/ece": 0.3411428571428571, "calib/final_conf_rate": 0.13671875, "calib/format_rate": 0.1328125, "calib/frac_conf_gt_0.9": 0.42857142857142855, "calib/gap": 0.30251633986928106, "calib/mean_conf": 0.6897142857142857, "calib/mu_c": 0.8452941176470589, "calib/mu_w": 0.5427777777777778, "calib/nonempty_final_conf_rate": 0.13671875, "calib/nonempty_reasoning_rate": 0.9609375, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.2725714285714285, "calib/std_conf": 0.36861505832574143, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.4917241379310345, "calib/step_q_c_n": 58.0, "calib/step_q_gap": -0.11404293034205787, "calib/step_q_w": 0.6057670682730923, "calib/step_q_w_n": 415.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 315.5, "completions/mean_terminated_length": 316.7372741699219, "completions/min_length": 0.0, "completions/min_terminated_length": 78.0, "epoch": 0.0896, "grad_norm": 0.10951647907495499, "kl": 0.1318359375, "learning_rate": 3.2222222222222227e-06, "loss": -0.9894, "mask/has_final_conf_rate": 0.13671875, "mask/share_final_conf": 0.007692576386034489, "mask/share_reasoning": 0.9166797995567322, "mask/share_step_conf": 0.0717214047908783, "num_tokens": 19218546.0, "reward": 0.047167807817459106, "reward_std": 0.09039464592933655, "rewards/accuracy_reward_step": 0.06640625, "rewards/final_brier_reward_step": 0.09505703300237656, "rewards/format_reward_step": 0.1328125, "rewards/step_l1_reward": -0.04056517034769058, "step": 84 }, { "adv/mean_abs_final_conf": 0.22187048196792603, "adv/mean_abs_reasoning": 0.25201520323753357, "adv/mean_abs_step_conf": 0.24062125384807587, "adv/ratio_final_to_reasoning": 0.8803853065912256, "adv/ratio_step_to_reasoning": 0.9547886427362936, "adv/std_final_conf": 0.5468955039978027, "adv/std_reasoning": 0.5484980940818787, "adv/std_step_conf": 0.5483436584472656, "calib/answer_extract_rate": 0.08984375, "calib/auroc": 0.6470588235294117, "calib/avg_num_step_conf": 1.86328125, "calib/ece": 0.5965217391304346, "calib/final_conf_rate": 0.08984375, "calib/format_rate": 0.0859375, "calib/frac_conf_gt_0.9": 0.7391304347826086, "calib/gap": 0.1454901960784314, "calib/mean_conf": 0.8191304347826085, "calib/mu_c": 0.9266666666666666, "calib/mu_w": 0.7811764705882352, "calib/nonempty_final_conf_rate": 0.08984375, "calib/nonempty_reasoning_rate": 0.9296875, "calib/nonempty_step_conf_rate": 0.92578125, "calib/pce": 0.5773913043478259, "calib/std_conf": 0.3387176975579242, "calib/step_conf_rate": 0.92578125, "calib/step_q_c": 0.5032000000000001, "calib/step_q_c_n": 25.0, "calib/step_q_gap": -0.0639567109144541, "calib/step_q_w": 0.5671567109144542, "calib/step_q_w_n": 452.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2447.0, "completions/max_terminated_length": 2447.0, "completions/mean_length": 329.05078125, "completions/mean_terminated_length": 329.05078125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.09066666666666667, "grad_norm": 0.09789498895406723, "kl": 0.1246795654296875, "learning_rate": 3.1944444444444443e-06, "loss": -0.5054, "mask/has_final_conf_rate": 0.08984375, "mask/share_final_conf": 0.005012996960431337, "mask/share_reasoning": 0.9164111614227295, "mask/share_step_conf": 0.07857586443424225, "num_tokens": 19410607.0, "reward": 0.015317767858505249, "reward_std": 0.0544668585062027, "rewards/accuracy_reward_step": 0.0234375, "rewards/final_brier_reward_step": 0.035423435270786285, "rewards/format_reward_step": 0.0859375, "rewards/step_l1_reward": -0.026662901043891907, "step": 85 }, { "adv/mean_abs_final_conf": 0.08323132991790771, "adv/mean_abs_reasoning": 0.08747018873691559, "adv/mean_abs_step_conf": 0.08138076215982437, "adv/ratio_final_to_reasoning": 0.9515393886738132, "adv/ratio_step_to_reasoning": 0.9303828348260867, "adv/std_final_conf": 0.33108845353126526, "adv/std_reasoning": 0.3308155834674835, "adv/std_step_conf": 0.33072537183761597, "calib/answer_extract_rate": 0.0234375, "calib/auroc": 1.0, "calib/avg_num_step_conf": 1.5703125, "calib/ece": 0.3833333333333333, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.01953125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.48400000000000004, "calib/mean_conf": 0.47333333333333333, "calib/mu_c": 0.554, "calib/mu_w": 0.07, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.921875, "calib/nonempty_step_conf_rate": 0.91796875, "calib/pce": 0.011666666666666667, "calib/std_conf": 0.2590152290674988, "calib/step_conf_rate": 0.91796875, "calib/step_q_c": 0.32419354838709674, "calib/step_q_c_n": 31.0, "calib/step_q_gap": -0.26582531953743155, "calib/step_q_w": 0.5900188679245283, "calib/step_q_w_n": 371.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1241.0, "completions/max_terminated_length": 1241.0, "completions/mean_length": 309.203125, "completions/mean_terminated_length": 311.6377868652344, "completions/min_length": 0.0, "completions/min_terminated_length": 44.0, "epoch": 0.09173333333333333, "grad_norm": 0.07561160624027252, "kl": 0.138153076171875, "learning_rate": 3.1666666666666667e-06, "loss": -0.2178, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.0008216683054342866, "mask/share_reasoning": 0.9200811386108398, "mask/share_step_conf": 0.07128473371267319, "num_tokens": 19595275.0, "reward": 0.008734840899705887, "reward_std": 0.01978309080004692, "rewards/accuracy_reward_step": 0.01953125, "rewards/final_brier_reward_step": 0.014989453367888927, "rewards/format_reward_step": 0.01953125, "rewards/step_l1_reward": -0.005332270171493292, "step": 86 }, { "adv/mean_abs_final_conf": 0.057972054928541183, "adv/mean_abs_reasoning": 0.057895708829164505, "adv/mean_abs_step_conf": 0.05789738893508911, "adv/ratio_final_to_reasoning": 1.0013186832136722, "adv/ratio_step_to_reasoning": 1.00002901952422, "adv/std_final_conf": 0.2868097424507141, "adv/std_reasoning": 0.2864321172237396, "adv/std_step_conf": 0.2864404320716858, "calib/answer_extract_rate": 0.01171875, "calib/auroc": 1.0, "calib/avg_num_step_conf": 1.26953125, "calib/ece": 0.010000000000000009, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.01171875, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/gap": 0.985, "calib/mean_conf": 0.6566666666666666, "calib/mu_c": 0.985, "calib/mu_w": 0.0, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.0, "calib/std_conf": 0.46435139950496784, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.32222222222222224, "calib/step_q_c_n": 9.0, "calib/step_q_gap": -0.3608895921237693, "calib/step_q_w": 0.6831118143459916, "calib/step_q_w_n": 316.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2022.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 243.0, "completions/mean_terminated_length": 243.9529571533203, "completions/min_length": 0.0, "completions/min_terminated_length": 33.0, "epoch": 0.0928, "grad_norm": 0.03810049220919609, "kl": 0.1692962646484375, "learning_rate": 3.138888888888889e-06, "loss": -0.1218, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0005845353007316589, "mask/share_reasoning": 0.9222879409790039, "mask/share_step_conf": 0.0732213482260704, "num_tokens": 19762979.0, "reward": 0.005352574400603771, "reward_std": 0.015139367431402206, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.011716797016561031, "rewards/format_reward_step": 0.01171875, "rewards/step_l1_reward": -0.004917897284030914, "step": 87 }, { "adv/mean_abs_final_conf": 0.02810167334973812, "adv/mean_abs_reasoning": 0.02824888564646244, "adv/mean_abs_step_conf": 0.027679258957505226, "adv/ratio_final_to_reasoning": 0.9947887396845775, "adv/ratio_step_to_reasoning": 0.979835427985155, "adv/std_final_conf": 0.16561469435691833, "adv/std_reasoning": 0.1653638482093811, "adv/std_step_conf": 0.1653398722410202, "calib/answer_extract_rate": 0.01171875, "calib/avg_num_step_conf": 1.2890625, "calib/ece": 0.20333333333333334, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.01171875, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.20333333333333334, "calib/mu_c": NaN, "calib/mu_w": 0.20333333333333334, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.921875, "calib/nonempty_step_conf_rate": 0.921875, "calib/pce": 0.20333333333333334, "calib/std_conf": 0.18116904322268257, "calib/step_conf_rate": 0.921875, "calib/step_q_w": 0.647889797979798, "calib/step_q_w_n": 330.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1410.0, "completions/max_terminated_length": 1410.0, "completions/mean_length": 313.7265625, "completions/mean_terminated_length": 313.7265625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.09386666666666667, "grad_norm": 0.02372133545577526, "kl": 0.135498046875, "learning_rate": 3.1111111111111116e-06, "loss": -0.0887, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0003868588828481734, "mask/share_reasoning": 0.9381458163261414, "mask/share_step_conf": 0.061467334628105164, "num_tokens": 19953141.0, "reward": 0.005528590641915798, "reward_std": 0.007678534835577011, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.010849609039723873, "rewards/format_reward_step": 0.01171875, "rewards/step_l1_reward": -0.002136178081855178, "step": 88 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.00390625, "calib/avg_num_step_conf": 0.97265625, "calib/ece": 0.95, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.95, "calib/mu_c": NaN, "calib/mu_w": 0.95, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.9453125, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.95, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.94140625, "calib/step_q_w": 0.712595983935743, "calib/step_q_w_n": 249.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1346.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 292.59375, "completions/mean_terminated_length": 294.89764404296875, "completions/min_length": 0.0, "completions/min_terminated_length": 30.0, "epoch": 0.09493333333333333, "grad_norm": 0.0020648923236876726, "kl": 0.142333984375, "learning_rate": 3.0833333333333336e-06, "loss": 0.0182, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 6.620762724196538e-05, "mask/share_reasoning": 0.9293363094329834, "mask/share_step_conf": 0.06278496980667114, "num_tokens": 20136933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 89 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 1.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.94921875, "calib/nonempty_step_conf_rate": 0.94921875, "calib/step_conf_rate": 0.94921875, "calib/step_q_w": 0.6169270833333333, "calib/step_q_w_n": 256.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1303.0, "completions/max_terminated_length": 1303.0, "completions/mean_length": 252.015625, "completions/mean_terminated_length": 253.00393676757812, "completions/min_length": 0.0, "completions/min_terminated_length": 28.0, "epoch": 0.096, "grad_norm": 0.001729490701109171, "kl": 0.1618499755859375, "learning_rate": 3.055555555555556e-06, "loss": 0.0186, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9174002408981323, "mask/share_step_conf": 0.07869353890419006, "num_tokens": 20304769.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 90 }, { "adv/mean_abs_final_conf": 0.01932401768863201, "adv/mean_abs_reasoning": 0.03858806565403938, "adv/mean_abs_step_conf": 0.01931961625814438, "adv/ratio_final_to_reasoning": 0.5007770501346491, "adv/ratio_step_to_reasoning": 0.5006629881724068, "adv/std_final_conf": 0.16558969020843506, "adv/std_reasoning": 0.23381583392620087, "adv/std_step_conf": 0.16555197536945343, "calib/answer_extract_rate": 0.0078125, "calib/avg_num_step_conf": 0.9765625, "calib/ece": 0.0050000000000000044, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.995, "calib/mu_c": 0.995, "calib/mu_w": NaN, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.953125, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.0, "calib/std_conf": 0.0050000000000000044, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.28, "calib/step_q_c_n": 1.0, "calib/step_q_gap": -0.3844979919678715, "calib/step_q_w": 0.6644979919678715, "calib/step_q_w_n": 249.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2736.0, "completions/max_terminated_length": 2736.0, "completions/mean_length": 262.5859375, "completions/mean_terminated_length": 263.6156921386719, "completions/min_length": 0.0, "completions/min_terminated_length": 32.0, "epoch": 0.09706666666666666, "grad_norm": 0.027071869000792503, "kl": 0.1643524169921875, "learning_rate": 3.0277777777777776e-06, "loss": -0.0465, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.00029008882120251656, "mask/share_reasoning": 0.9296793341636658, "mask/share_step_conf": 0.06612434983253479, "num_tokens": 20479703.0, "reward": 0.001732663600705564, "reward_std": 0.0049007125198841095, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.003905859310179949, "rewards/format_reward_step": 0.00390625, "rewards/step_l1_reward": -0.0027842819690704346, "step": 91 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019280418753623962, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16521605849266052, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.00390625, "calib/avg_num_step_conf": 0.96484375, "calib/ece": 0.31000000000000005, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.69, "calib/mu_c": 0.69, "calib/mu_w": NaN, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.95703125, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.0, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.953125, "calib/step_q_w": 0.7439919028340082, "calib/step_q_w_n": 247.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2197.0, "completions/max_terminated_length": 2197.0, "completions/mean_length": 253.13671875, "completions/mean_terminated_length": 253.13671875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.09813333333333334, "grad_norm": 0.0024913293309509754, "kl": 0.16802978515625, "learning_rate": 3e-06, "loss": 0.0157, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.000139865733217448, "mask/share_reasoning": 0.9277439117431641, "mask/share_step_conf": 0.0721161812543869, "num_tokens": 20651226.0, "reward": 0.0003906250058207661, "reward_std": 0.001104854280129075, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 92 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.921875, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.921875, "calib/nonempty_step_conf_rate": 0.921875, "calib/step_conf_rate": 0.921875, "calib/step_q_w": 0.6648535310734464, "calib/step_q_w_n": 236.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1561.0, "completions/max_terminated_length": 1561.0, "completions/mean_length": 252.34375, "completions/mean_terminated_length": 252.34375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0992, "grad_norm": 0.0017887783469632268, "kl": 0.1580810546875, "learning_rate": 2.9722222222222225e-06, "loss": 0.0193, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9293789863586426, "mask/share_step_conf": 0.07062099128961563, "num_tokens": 20821602.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 93 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.9296875, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.9296875, "calib/nonempty_step_conf_rate": 0.9296875, "calib/step_conf_rate": 0.9296875, "calib/step_q_w": 0.6646086834733892, "calib/step_q_w_n": 238.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2464.0, "completions/max_terminated_length": 2464.0, "completions/mean_length": 262.765625, "completions/mean_terminated_length": 262.765625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.10026666666666667, "grad_norm": 0.00258839363232255, "kl": 0.1856231689453125, "learning_rate": 2.944444444444445e-06, "loss": 0.0206, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.930269181728363, "mask/share_step_conf": 0.06973081827163696, "num_tokens": 20997550.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 94 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0078125, "calib/avg_num_step_conf": 0.89453125, "calib/ece": 0.64, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.5, "calib/mean_conf": 0.64, "calib/mu_c": NaN, "calib/mu_w": 0.64, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.90234375, "calib/nonempty_step_conf_rate": 0.89453125, "calib/pce": 0.64, "calib/std_conf": 0.34, "calib/step_conf_rate": 0.89453125, "calib/step_q_w": 0.6451854439592429, "calib/step_q_w_n": 229.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3054.0, "completions/max_terminated_length": 3054.0, "completions/mean_length": 277.64453125, "completions/mean_terminated_length": 278.73333740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 27.0, "epoch": 0.10133333333333333, "grad_norm": 0.002397137461230159, "kl": 0.14752960205078125, "learning_rate": 2.916666666666667e-06, "loss": 0.0195, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.00020610095816664398, "mask/share_reasoning": 0.920891284942627, "mask/share_step_conf": 0.07499632984399796, "num_tokens": 21174755.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 95 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019280418753623962, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16521605849266052, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.00390625, "calib/avg_num_step_conf": 0.921875, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.92578125, "calib/nonempty_step_conf_rate": 0.921875, "calib/step_conf_rate": 0.921875, "calib/step_q_w": 0.685409604519774, "calib/step_q_w_n": 236.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 244.8984375, "completions/mean_terminated_length": 244.8984375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.1024, "grad_norm": 0.0031177501659840345, "kl": 0.1843109130859375, "learning_rate": 2.888888888888889e-06, "loss": 0.0152, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9352742433547974, "mask/share_step_conf": 0.06472573429346085, "num_tokens": 21343265.0, "reward": 0.0003906250058207661, "reward_std": 0.001104854280129075, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 96 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.00390625, "calib/avg_num_step_conf": 0.94140625, "calib/ece": 0.9, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.9, "calib/mu_c": NaN, "calib/mu_w": 0.9, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.921875, "calib/nonempty_step_conf_rate": 0.91796875, "calib/pce": 0.9, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.91796875, "calib/step_q_w": 0.6480928077455048, "calib/step_q_w_n": 241.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2585.0, "completions/max_terminated_length": 2585.0, "completions/mean_length": 265.30859375, "completions/mean_terminated_length": 265.30859375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.10346666666666667, "grad_norm": 0.002110776724293828, "kl": 0.1783294677734375, "learning_rate": 2.861111111111111e-06, "loss": 0.0197, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 8.387653360841796e-05, "mask/share_reasoning": 0.9345265030860901, "mask/share_step_conf": 0.06538967043161392, "num_tokens": 21516256.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 97 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0078125, "calib/avg_num_step_conf": 0.88671875, "calib/ece": 0.0, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.0, "calib/mu_c": NaN, "calib/mu_w": 0.0, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.89453125, "calib/nonempty_step_conf_rate": 0.88671875, "calib/pce": 0.0, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.88671875, "calib/step_q_w": 0.6609838472834068, "calib/step_q_w_n": 227.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1131.0, "completions/max_terminated_length": 1131.0, "completions/mean_length": 265.10546875, "completions/mean_terminated_length": 268.2490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 31.0, "epoch": 0.10453333333333334, "grad_norm": 0.0032506384886801243, "kl": 0.165618896484375, "learning_rate": 2.8333333333333335e-06, "loss": 0.0207, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.0001435853773728013, "mask/share_reasoning": 0.9282910823822021, "mask/share_step_conf": 0.05984655022621155, "num_tokens": 21690307.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 98 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0078125, "calib/avg_num_step_conf": 0.87109375, "calib/ece": 0.6261333333333332, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.3333333333333333, "calib/mean_conf": 0.6261333333333333, "calib/mu_c": NaN, "calib/mu_w": 0.6261333333333333, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.87890625, "calib/nonempty_step_conf_rate": 0.87109375, "calib/pce": 0.6261333333333332, "calib/std_conf": 0.4335165151281885, "calib/step_conf_rate": 0.87109375, "calib/step_q_w": 0.6548052316890881, "calib/step_q_w_n": 223.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2595.0, "completions/max_terminated_length": 2595.0, "completions/mean_length": 307.17578125, "completions/mean_terminated_length": 308.3804016113281, "completions/min_length": 0.0, "completions/min_terminated_length": 26.0, "epoch": 0.1056, "grad_norm": 0.0021076835691928864, "kl": 0.1526947021484375, "learning_rate": 2.805555555555556e-06, "loss": 0.0201, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0004948556306771934, "mask/share_reasoning": 0.9374754428863525, "mask/share_step_conf": 0.05812348425388336, "num_tokens": 21874744.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 99 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.88671875, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.88671875, "calib/nonempty_step_conf_rate": 0.88671875, "calib/step_conf_rate": 0.88671875, "calib/step_q_w": 0.6474894273127755, "calib/step_q_w_n": 227.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2829.0, "completions/max_terminated_length": 2829.0, "completions/mean_length": 283.796875, "completions/mean_terminated_length": 283.796875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.10666666666666667, "grad_norm": 0.003983226139098406, "kl": 0.1564483642578125, "learning_rate": 2.7777777777777783e-06, "loss": 0.0206, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9387972354888916, "mask/share_step_conf": 0.06120274215936661, "num_tokens": 22054804.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 100 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.00390625, "calib/avg_num_step_conf": 0.890625, "calib/ece": 0.12, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.12, "calib/mu_c": NaN, "calib/mu_w": 0.12, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.89453125, "calib/nonempty_step_conf_rate": 0.890625, "calib/pce": 0.12, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.890625, "calib/step_q_w": 0.7245760233918128, "calib/step_q_w_n": 228.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2346.0, "completions/max_terminated_length": 2346.0, "completions/mean_length": 327.859375, "completions/mean_terminated_length": 327.859375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.10773333333333333, "grad_norm": 0.0020883805118501186, "kl": 0.144989013671875, "learning_rate": 2.7500000000000004e-06, "loss": 0.0196, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 8.778089977568015e-05, "mask/share_reasoning": 0.9369585514068604, "mask/share_step_conf": 0.0629536435008049, "num_tokens": 22245728.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 101 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019280418753623962, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16521605849266052, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.00390625, "calib/avg_num_step_conf": 0.91015625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.9140625, "calib/nonempty_step_conf_rate": 0.91015625, "calib/step_conf_rate": 0.91015625, "calib/step_q_w": 0.6792072961373391, "calib/step_q_w_n": 233.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1122.0, "completions/max_terminated_length": 1122.0, "completions/mean_length": 224.41796875, "completions/mean_terminated_length": 224.41796875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.1088, "grad_norm": 0.0024927027989178896, "kl": 0.187957763671875, "learning_rate": 2.7222222222222224e-06, "loss": 0.0142, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9262990951538086, "mask/share_step_conf": 0.0737009346485138, "num_tokens": 22409875.0, "reward": 0.0003906250058207661, "reward_std": 0.001104854280129075, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 102 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.90234375, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.90234375, "calib/nonempty_step_conf_rate": 0.90234375, "calib/step_conf_rate": 0.90234375, "calib/step_q_w": 0.6472352092352092, "calib/step_q_w_n": 231.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1993.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 315.05859375, "completions/mean_terminated_length": 315.05859375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.10986666666666667, "grad_norm": 0.002758257556706667, "kl": 0.150848388671875, "learning_rate": 2.6944444444444444e-06, "loss": 0.0194, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9427634477615356, "mask/share_step_conf": 0.05723656713962555, "num_tokens": 22595082.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 103 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019280418753623962, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16521605849266052, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0078125, "calib/auroc": 1.0, "calib/avg_num_step_conf": 0.8828125, "calib/ece": 0.015, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": 0.97, "calib/mean_conf": 0.515, "calib/mu_c": 1.0, "calib/mu_w": 0.03, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.890625, "calib/nonempty_step_conf_rate": 0.8828125, "calib/pce": 0.015, "calib/std_conf": 0.485, "calib/step_conf_rate": 0.8828125, "calib/step_q_w": 0.6190050147492626, "calib/step_q_w_n": 226.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2761.0, "completions/max_terminated_length": 2761.0, "completions/mean_length": 265.98828125, "completions/mean_terminated_length": 265.98828125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.11093333333333333, "grad_norm": 0.0026147959288209677, "kl": 0.17083740234375, "learning_rate": 2.666666666666667e-06, "loss": 0.009, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.00016867897647898644, "mask/share_reasoning": 0.9309343099594116, "mask/share_step_conf": 0.06889700889587402, "num_tokens": 22769855.0, "reward": 0.0003906250058207661, "reward_std": 0.001104854280129075, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 104 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.038560837507247925, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.233650803565979, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01171875, "calib/auroc": 0.25, "calib/avg_num_step_conf": 0.84765625, "calib/ece": 0.33000000000000007, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0050000000000000044, "calib/mean_conf": 0.9966666666666667, "calib/mu_c": 0.995, "calib/mu_w": 1.0, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.859375, "calib/nonempty_step_conf_rate": 0.84765625, "calib/pce": 0.33000000000000007, "calib/std_conf": 0.004714045207910321, "calib/step_conf_rate": 0.84765625, "calib/step_q_w": 0.6160026113671274, "calib/step_q_w_n": 217.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2069.0, "completions/max_terminated_length": 2069.0, "completions/mean_length": 307.92578125, "completions/mean_terminated_length": 309.13336181640625, "completions/min_length": 0.0, "completions/min_terminated_length": 29.0, "epoch": 0.112, "grad_norm": 0.01371827907860279, "kl": 0.16375732421875, "learning_rate": 2.6388888888888893e-06, "loss": 0.0188, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0005875006900168955, "mask/share_reasoning": 0.938085675239563, "mask/share_step_conf": 0.05742061138153076, "num_tokens": 22954444.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 105 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.93359375, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.93359375, "calib/nonempty_step_conf_rate": 0.93359375, "calib/step_conf_rate": 0.93359375, "calib/step_q_w": 0.6228207810320782, "calib/step_q_w_n": 239.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 899.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 256.625, "completions/mean_terminated_length": 256.625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.11306666666666666, "grad_norm": 0.0019686499144881964, "kl": 0.1787109375, "learning_rate": 2.6111111111111113e-06, "loss": 0.0197, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9375455975532532, "mask/share_step_conf": 0.06245441734790802, "num_tokens": 23124724.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 106 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019280418753623962, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16521605849266052, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.00390625, "calib/avg_num_step_conf": 0.91796875, "calib/ece": 1.0, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.0, "calib/mu_c": 0.0, "calib/mu_w": NaN, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.921875, "calib/nonempty_step_conf_rate": 0.91796875, "calib/pce": 0.0, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.91796875, "calib/step_q_w": 0.6645486524822696, "calib/step_q_w_n": 235.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1825.0, "completions/max_terminated_length": 1825.0, "completions/mean_length": 262.71875, "completions/mean_terminated_length": 262.71875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.11413333333333334, "grad_norm": 0.002731727436184883, "kl": 0.17803955078125, "learning_rate": 2.5833333333333337e-06, "loss": 0.0128, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 5.9964364481857046e-05, "mask/share_reasoning": 0.930545449256897, "mask/share_step_conf": 0.06939459592103958, "num_tokens": 23296596.0, "reward": 0.0003906250058207661, "reward_std": 0.001104854280129075, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 107 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.89453125, "calib/ece": 0.0, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.0, "calib/mu_c": NaN, "calib/mu_w": 0.0, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.89453125, "calib/nonempty_step_conf_rate": 0.89453125, "calib/pce": 0.0, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.89453125, "calib/step_q_w": 0.6813867540029113, "calib/step_q_w_n": 229.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1382.0, "completions/max_terminated_length": 1382.0, "completions/mean_length": 279.796875, "completions/mean_terminated_length": 280.8941345214844, "completions/min_length": 0.0, "completions/min_terminated_length": 37.0, "epoch": 0.1152, "grad_norm": 0.002479060785844922, "kl": 0.161041259765625, "learning_rate": 2.5555555555555557e-06, "loss": 0.0199, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.00010872266284422949, "mask/share_reasoning": 0.9320110082626343, "mask/share_step_conf": 0.06397401541471481, "num_tokens": 23471456.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 108 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.9375, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.93359375, "calib/nonempty_step_conf_rate": 0.93359375, "calib/step_conf_rate": 0.93359375, "calib/step_q_w": 0.6219333333333333, "calib/step_q_w_n": 240.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2597.0, "completions/max_terminated_length": 2597.0, "completions/mean_length": 295.53515625, "completions/mean_terminated_length": 295.53515625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.11626666666666667, "grad_norm": 0.0018036911496892571, "kl": 0.159149169921875, "learning_rate": 2.5277777777777778e-06, "loss": 0.0202, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9353963136672974, "mask/share_step_conf": 0.06460371613502502, "num_tokens": 23651713.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 109 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.9453125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.9453125, "calib/nonempty_step_conf_rate": 0.9453125, "calib/step_conf_rate": 0.9453125, "calib/step_q_w": 0.7107644628099173, "calib/step_q_w_n": 242.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2162.0, "completions/max_terminated_length": 2162.0, "completions/mean_length": 246.1640625, "completions/mean_terminated_length": 246.1640625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.11733333333333333, "grad_norm": 0.002016980666667223, "kl": 0.1808013916015625, "learning_rate": 2.5e-06, "loss": 0.0193, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9226292371749878, "mask/share_step_conf": 0.07737080752849579, "num_tokens": 23819651.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 110 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.90625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.90625, "calib/nonempty_step_conf_rate": 0.90625, "calib/step_conf_rate": 0.90625, "calib/step_q_w": 0.6728936781609195, "calib/step_q_w_n": 232.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2647.0, "completions/max_terminated_length": 2647.0, "completions/mean_length": 301.8828125, "completions/mean_terminated_length": 301.8828125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.1184, "grad_norm": 0.0021393022034317255, "kl": 0.1673126220703125, "learning_rate": 2.4722222222222226e-06, "loss": 0.0199, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9377536177635193, "mask/share_step_conf": 0.06224638968706131, "num_tokens": 24004341.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 111 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019280418753623962, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16521605849266052, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01171875, "calib/auroc": 1.0, "calib/avg_num_step_conf": 0.87890625, "calib/ece": 0.035, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": 0.9299999999999999, "calib/mean_conf": 0.535, "calib/mu_c": 1.0, "calib/mu_w": 0.07, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.890625, "calib/nonempty_step_conf_rate": 0.87890625, "calib/pce": 0.035, "calib/std_conf": 0.465, "calib/step_conf_rate": 0.87890625, "calib/step_q_w": 0.5784821250000001, "calib/step_q_w_n": 224.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2932.0, "completions/max_terminated_length": 2932.0, "completions/mean_length": 318.13671875, "completions/mean_terminated_length": 318.13671875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.11946666666666667, "grad_norm": 0.0022719684056937695, "kl": 0.14599609375, "learning_rate": 2.4444444444444447e-06, "loss": 0.0105, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.0002676117292139679, "mask/share_reasoning": 0.9370044469833374, "mask/share_step_conf": 0.06272794306278229, "num_tokens": 24193704.0, "reward": 0.0003906250058207661, "reward_std": 0.001104854280129075, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 112 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.90234375, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.890625, "calib/nonempty_step_conf_rate": 0.890625, "calib/step_conf_rate": 0.890625, "calib/step_q_w": 0.6473191919191921, "calib/step_q_w_n": 231.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1491.0, "completions/max_terminated_length": 1491.0, "completions/mean_length": 225.890625, "completions/mean_terminated_length": 225.890625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.12053333333333334, "grad_norm": 0.0022563741076737642, "kl": 0.21331787109375, "learning_rate": 2.4166666666666667e-06, "loss": 0.0204, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9262930154800415, "mask/share_step_conf": 0.0737069845199585, "num_tokens": 24356732.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 113 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.90625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.90625, "calib/nonempty_step_conf_rate": 0.90625, "calib/step_conf_rate": 0.90625, "calib/step_q_w": 0.6297916666666667, "calib/step_q_w_n": 232.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2296.0, "completions/max_terminated_length": 2296.0, "completions/mean_length": 231.88671875, "completions/mean_terminated_length": 232.7960968017578, "completions/min_length": 0.0, "completions/min_terminated_length": 32.0, "epoch": 0.1216, "grad_norm": 0.0019388310611248016, "kl": 0.197021484375, "learning_rate": 2.388888888888889e-06, "loss": 0.0203, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9167957305908203, "mask/share_step_conf": 0.07929803431034088, "num_tokens": 24521119.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 114 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01171875, "calib/avg_num_step_conf": 0.8984375, "calib/ece": 0.5, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.5, "calib/mean_conf": 0.5, "calib/mu_c": NaN, "calib/mu_w": 0.5, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.91015625, "calib/nonempty_step_conf_rate": 0.8984375, "calib/pce": 0.5, "calib/std_conf": 0.5, "calib/step_conf_rate": 0.8984375, "calib/step_q_w": 0.5842565217391303, "calib/step_q_w_n": 230.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 237.8046875, "completions/mean_terminated_length": 237.8046875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.12266666666666666, "grad_norm": 0.0018869774648919702, "kl": 0.196868896484375, "learning_rate": 2.361111111111111e-06, "loss": 0.021, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.00013558330829255283, "mask/share_reasoning": 0.9318229556083679, "mask/share_step_conf": 0.06804148852825165, "num_tokens": 24687261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 115 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.00390625, "calib/avg_num_step_conf": 0.88671875, "calib/ece": 0.96, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.96, "calib/mu_c": NaN, "calib/mu_w": 0.96, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.890625, "calib/nonempty_step_conf_rate": 0.88671875, "calib/pce": 0.96, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.88671875, "calib/step_q_w": 0.57208046989721, "calib/step_q_w_n": 227.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2479.0, "completions/max_terminated_length": 2479.0, "completions/mean_length": 273.47265625, "completions/mean_terminated_length": 273.47265625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.12373333333333333, "grad_norm": 0.0021785381250083447, "kl": 0.1786956787109375, "learning_rate": 2.3333333333333336e-06, "loss": 0.0193, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 8.387653360841796e-05, "mask/share_reasoning": 0.9253822565078735, "mask/share_step_conf": 0.07453387975692749, "num_tokens": 24861790.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 116 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0078125, "calib/avg_num_step_conf": 0.87890625, "calib/ece": 0.765, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.765, "calib/mu_c": NaN, "calib/mu_w": 0.765, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.88671875, "calib/nonempty_step_conf_rate": 0.87890625, "calib/pce": 0.765, "calib/std_conf": 0.015000000000000013, "calib/step_conf_rate": 0.87890625, "calib/step_q_w": 0.6962148148148147, "calib/step_q_w_n": 225.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1893.0, "completions/max_terminated_length": 1893.0, "completions/mean_length": 263.1171875, "completions/mean_terminated_length": 263.1171875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.1248, "grad_norm": 0.0013779336586594582, "kl": 0.170562744140625, "learning_rate": 2.305555555555556e-06, "loss": 0.0194, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.00015250130672939122, "mask/share_reasoning": 0.9313722848892212, "mask/share_step_conf": 0.06847520172595978, "num_tokens": 25035748.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 117 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.9140625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.89453125, "calib/nonempty_step_conf_rate": 0.89453125, "calib/step_conf_rate": 0.89453125, "calib/step_q_w": 0.6240760683760683, "calib/step_q_w_n": 234.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2398.0, "completions/max_terminated_length": 2398.0, "completions/mean_length": 220.796875, "completions/mean_terminated_length": 221.6627655029297, "completions/min_length": 0.0, "completions/min_terminated_length": 27.0, "epoch": 0.12586666666666665, "grad_norm": 0.0024035677779465914, "kl": 0.216339111328125, "learning_rate": 2.277777777777778e-06, "loss": 0.02, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9155505895614624, "mask/share_step_conf": 0.0805431604385376, "num_tokens": 25196280.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 118 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.90625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.90625, "calib/nonempty_step_conf_rate": 0.90625, "calib/step_conf_rate": 0.90625, "calib/step_q_w": 0.6257777298850574, "calib/step_q_w_n": 232.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1235.0, "completions/max_terminated_length": 1235.0, "completions/mean_length": 281.3515625, "completions/mean_terminated_length": 281.3515625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.12693333333333334, "grad_norm": 1.403180480003357, "kl": 0.5548095703125, "learning_rate": 2.25e-06, "loss": 0.198, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9420675039291382, "mask/share_step_conf": 0.057932544499635696, "num_tokens": 25373370.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 119 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.00390625, "calib/avg_num_step_conf": 0.94140625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.9453125, "calib/nonempty_step_conf_rate": 0.94140625, "calib/step_conf_rate": 0.94140625, "calib/step_q_w": 0.6420318118948825, "calib/step_q_w_n": 241.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1320.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 235.8515625, "completions/mean_terminated_length": 235.8515625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.128, "grad_norm": 0.0016463312786072493, "kl": 0.202301025390625, "learning_rate": 2.222222222222222e-06, "loss": 0.0194, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9314963817596436, "mask/share_step_conf": 0.06850366294384003, "num_tokens": 25540436.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 120 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.92578125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.92578125, "calib/nonempty_step_conf_rate": 0.92578125, "calib/step_conf_rate": 0.92578125, "calib/step_q_w": 0.5966310829817159, "calib/step_q_w_n": 237.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1815.0, "completions/max_terminated_length": 1815.0, "completions/mean_length": 270.73046875, "completions/mean_terminated_length": 271.79217529296875, "completions/min_length": 0.0, "completions/min_terminated_length": 23.0, "epoch": 0.12906666666666666, "grad_norm": 0.09108592569828033, "kl": 0.239715576171875, "learning_rate": 2.1944444444444445e-06, "loss": 0.0368, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9250909090042114, "mask/share_step_conf": 0.07100285589694977, "num_tokens": 25714799.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 121 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.00390625, "calib/avg_num_step_conf": 0.95703125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.9453125, "calib/nonempty_step_conf_rate": 0.94140625, "calib/step_conf_rate": 0.94140625, "calib/step_q_w": 0.6297778231292517, "calib/step_q_w_n": 245.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2014.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 213.75390625, "completions/mean_terminated_length": 213.75390625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.13013333333333332, "grad_norm": 0.015449677594006062, "kl": 0.256256103515625, "learning_rate": 2.166666666666667e-06, "loss": 0.0222, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.924127459526062, "mask/share_step_conf": 0.07587258517742157, "num_tokens": 25876864.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 122 }, { "adv/mean_abs_final_conf": 0.01880052126944065, "adv/mean_abs_reasoning": 0.019280418753623962, "adv/mean_abs_step_conf": 0.019320303574204445, "adv/ratio_final_to_reasoning": 0.9751095922596023, "adv/ratio_step_to_reasoning": 1.0020686698297456, "adv/std_final_conf": 0.16110378503799438, "adv/std_reasoning": 0.16521605849266052, "adv/std_step_conf": 0.165557861328125, "calib/answer_extract_rate": 0.00390625, "calib/avg_num_step_conf": 0.953125, "calib/ece": 0.495, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.5, "calib/mean_conf": 0.495, "calib/mu_c": NaN, "calib/mu_w": 0.495, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.9140625, "calib/nonempty_step_conf_rate": 0.9140625, "calib/pce": 0.495, "calib/std_conf": 0.495, "calib/step_conf_rate": 0.9140625, "calib/step_q_w": 0.540677868852459, "calib/step_q_w_n": 244.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1183.0, "completions/max_terminated_length": 1183.0, "completions/mean_length": 247.47265625, "completions/mean_terminated_length": 247.47265625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.1312, "grad_norm": 0.01294808741658926, "kl": 0.2294769287109375, "learning_rate": 2.138888888888889e-06, "loss": -0.0594, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.00017276535800192505, "mask/share_reasoning": 0.932379424571991, "mask/share_step_conf": 0.06744778901338577, "num_tokens": 26045505.0, "reward": -0.0010277825640514493, "reward_std": 0.0029070081654936075, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 7.773437391733751e-05, "rewards/format_reward_step": 0.00390625, "rewards/step_l1_reward": -0.0029145495500415564, "step": 123 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.8984375, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.8984375, "calib/nonempty_step_conf_rate": 0.8984375, "calib/step_conf_rate": 0.8984375, "calib/step_q_w": 0.5718039130434783, "calib/step_q_w_n": 230.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 210.28125, "completions/mean_terminated_length": 210.28125, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.13226666666666667, "grad_norm": 0.001803459133952856, "kl": 0.2413482666015625, "learning_rate": 2.1111111111111114e-06, "loss": 0.0208, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9198349118232727, "mask/share_step_conf": 0.08016512542963028, "num_tokens": 26206153.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 124 }, { "adv/mean_abs_final_conf": 0.01880052126944065, "adv/mean_abs_reasoning": 0.05784125626087189, "adv/mean_abs_step_conf": 0.01928030140697956, "adv/ratio_final_to_reasoning": 0.32503653075320077, "adv/ratio_step_to_reasoning": 0.33333130456266014, "adv/std_final_conf": 0.16110378503799438, "adv/std_reasoning": 0.2861626148223877, "adv/std_step_conf": 0.16521507501602173, "calib/answer_extract_rate": 0.015625, "calib/auroc": 0.75, "calib/avg_num_step_conf": 0.875, "calib/ece": 0.3225, "calib/final_conf_rate": 0.015625, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": 0.3550000000000001, "calib/mean_conf": 0.7675, "calib/mu_c": 0.9450000000000001, "calib/mu_w": 0.59, "calib/nonempty_final_conf_rate": 0.015625, "calib/nonempty_reasoning_rate": 0.8828125, "calib/nonempty_step_conf_rate": 0.87109375, "calib/pce": 0.295, "calib/std_conf": 0.3361826140656295, "calib/step_conf_rate": 0.87109375, "calib/step_q_w": 0.5149190476190476, "calib/step_q_w_n": 224.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1395.0, "completions/max_terminated_length": 1395.0, "completions/mean_length": 230.515625, "completions/mean_terminated_length": 231.41961669921875, "completions/min_length": 0.0, "completions/min_terminated_length": 31.0, "epoch": 0.13333333333333333, "grad_norm": 0.04646065831184387, "kl": 0.235107421875, "learning_rate": 2.0833333333333334e-06, "loss": -0.0719, "mask/has_final_conf_rate": 0.015625, "mask/share_final_conf": 0.0006107184453867376, "mask/share_reasoning": 0.9152103662490845, "mask/share_step_conf": 0.08027268201112747, "num_tokens": 26369973.0, "reward": 0.0008209494408220053, "reward_std": 0.002321995561942458, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 7.773437391733751e-05, "rewards/format_reward_step": 0.00390625, "rewards/step_l1_reward": -0.0007795855053700507, "step": 125 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019280418753623962, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16521605849266052, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.015625, "calib/auroc": 0.0, "calib/avg_num_step_conf": 0.890625, "calib/ece": 0.7333333333333334, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.3333333333333333, "calib/gap": -0.30000000000000004, "calib/mean_conf": 0.7999999999999999, "calib/mu_c": 0.6, "calib/mu_w": 0.9, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.8984375, "calib/nonempty_step_conf_rate": 0.8828125, "calib/pce": 0.6, "calib/std_conf": 0.16329931618554522, "calib/step_conf_rate": 0.8828125, "calib/step_q_w": 0.5426749999999999, "calib/step_q_w_n": 228.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1117.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 184.12109375, "completions/mean_terminated_length": 184.12109375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.1344, "grad_norm": 0.004394296556711197, "kl": 0.2568359375, "learning_rate": 2.0555555555555555e-06, "loss": 0.005, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0003749749739654362, "mask/share_reasoning": 0.9051406383514404, "mask/share_step_conf": 0.09448444843292236, "num_tokens": 26522572.0, "reward": 0.0003906250058207661, "reward_std": 0.001104854280129075, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 126 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019280418753623962, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16521605849266052, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0078125, "calib/auroc": 1.0, "calib/avg_num_step_conf": 0.921875, "calib/ece": 0.45999999999999996, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.05999999999999994, "calib/mean_conf": 0.96, "calib/mu_c": 0.99, "calib/mu_w": 0.93, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.91796875, "calib/nonempty_step_conf_rate": 0.91015625, "calib/pce": 0.45999999999999996, "calib/std_conf": 0.02999999999999997, "calib/step_conf_rate": 0.91015625, "calib/step_q_w": 0.5421467514124293, "calib/step_q_w_n": 236.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1589.0, "completions/max_terminated_length": 1589.0, "completions/mean_length": 188.64453125, "completions/mean_terminated_length": 188.64453125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.13546666666666668, "grad_norm": 0.009678676724433899, "kl": 0.27178955078125, "learning_rate": 2.027777777777778e-06, "loss": 0.0113, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.00019561882072594017, "mask/share_reasoning": 0.9013167023658752, "mask/share_step_conf": 0.09848769754171371, "num_tokens": 26674537.0, "reward": 0.0003906250058207661, "reward_std": 0.001104854280129075, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 127 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019280418753623962, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16521605849266052, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0078125, "calib/auroc": 0.0, "calib/avg_num_step_conf": 0.88671875, "calib/ece": 0.845, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": -0.69, "calib/mean_conf": 0.615, "calib/mu_c": 0.27, "calib/mu_w": 0.96, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.890625, "calib/nonempty_step_conf_rate": 0.8828125, "calib/pce": 0.48, "calib/std_conf": 0.345, "calib/step_conf_rate": 0.8828125, "calib/step_q_w": 0.5847447870778268, "calib/step_q_w_n": 227.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2550.0, "completions/max_terminated_length": 2550.0, "completions/mean_length": 255.796875, "completions/mean_terminated_length": 255.796875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.13653333333333334, "grad_norm": 0.003527594031766057, "kl": 0.225738525390625, "learning_rate": 2.0000000000000003e-06, "loss": 0.015, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.00018033437663689256, "mask/share_reasoning": 0.9266326427459717, "mask/share_step_conf": 0.07318704575300217, "num_tokens": 26846685.0, "reward": 0.0003906250058207661, "reward_std": 0.001104854280129075, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 128 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.921875, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.9140625, "calib/nonempty_step_conf_rate": 0.9140625, "calib/step_conf_rate": 0.9140625, "calib/step_q_w": 0.5749310734463278, "calib/step_q_w_n": 236.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1232.0, "completions/max_terminated_length": 1232.0, "completions/mean_length": 187.78515625, "completions/mean_terminated_length": 187.78515625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.1376, "grad_norm": 0.0018569625681266189, "kl": 0.26055908203125, "learning_rate": 1.9722222222222224e-06, "loss": 0.0208, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.9001195430755615, "mask/share_step_conf": 0.09988044202327728, "num_tokens": 26997142.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 129 }, { "adv/mean_abs_final_conf": 0.019323350861668587, "adv/mean_abs_reasoning": 0.07714889943599701, "adv/mean_abs_step_conf": 0.019258743152022362, "adv/ratio_final_to_reasoning": 0.25046826335739636, "adv/ratio_step_to_reasoning": 0.24963082160361186, "adv/std_final_conf": 0.16558398306369781, "adv/std_reasoning": 0.33054885268211365, "adv/std_step_conf": 0.1650303304195404, "calib/answer_extract_rate": 0.015625, "calib/avg_num_step_conf": 0.95703125, "calib/ece": 0.2425, "calib/final_conf_rate": 0.015625, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.5, "calib/mean_conf": 0.7575000000000001, "calib/mu_c": 0.7575000000000001, "calib/mu_w": NaN, "calib/nonempty_final_conf_rate": 0.015625, "calib/nonempty_reasoning_rate": 0.91796875, "calib/nonempty_step_conf_rate": 0.90625, "calib/pce": 0.0, "calib/std_conf": 0.2621426138574192, "calib/step_conf_rate": 0.90625, "calib/step_q_c": 0.55, "calib/step_q_c_n": 1.0, "calib/step_q_gap": 0.0374379781420765, "calib/step_q_w": 0.5125620218579235, "calib/step_q_w_n": 244.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1036.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 198.6640625, "completions/mean_terminated_length": 198.6640625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.13866666666666666, "grad_norm": 0.029257914051413536, "kl": 0.258148193359375, "learning_rate": 1.944444444444445e-06, "loss": -0.0872, "mask/has_final_conf_rate": 0.015625, "mask/share_final_conf": 0.0011932657798752189, "mask/share_reasoning": 0.9098485708236694, "mask/share_step_conf": 0.08895816653966904, "num_tokens": 27153288.0, "reward": 0.0035145406145602465, "reward_std": 0.009940622374415398, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.0036812499165534973, "rewards/format_reward_step": 0.00390625, "rewards/step_l1_reward": -0.000558418920263648, "step": 130 }, { "adv/mean_abs_final_conf": 0.038646847009658813, "adv/mean_abs_reasoning": 0.038560837507247925, "adv/mean_abs_step_conf": 0.0379277728497982, "adv/ratio_final_to_reasoning": 1.002230488442963, "adv/ratio_step_to_reasoning": 0.9835827046720463, "adv/std_final_conf": 0.23417198657989502, "adv/std_reasoning": 0.233650803565979, "adv/std_step_conf": 0.22983452677726746, "calib/answer_extract_rate": 0.015625, "calib/avg_num_step_conf": 0.96875, "calib/ece": 0.53, "calib/final_conf_rate": 0.015625, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.25, "calib/mean_conf": 0.53, "calib/mu_c": NaN, "calib/mu_w": 0.53, "calib/nonempty_final_conf_rate": 0.015625, "calib/nonempty_reasoning_rate": 0.94921875, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.53, "calib/std_conf": 0.3463379852109786, "calib/step_conf_rate": 0.94140625, "calib/step_q_w": 0.5664596774193549, "calib/step_q_w_n": 248.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2109.0, "completions/max_terminated_length": 2109.0, "completions/mean_length": 190.38671875, "completions/mean_terminated_length": 190.38671875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.13973333333333332, "grad_norm": 0.027975257486104965, "kl": 0.25921630859375, "learning_rate": 1.916666666666667e-06, "loss": -0.1447, "mask/has_final_conf_rate": 0.015625, "mask/share_final_conf": 0.0005259591853246093, "mask/share_reasoning": 0.9087549448013306, "mask/share_step_conf": 0.09071913361549377, "num_tokens": 27308235.0, "reward": 0.004286515526473522, "reward_std": 0.012124096974730492, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.00742187537252903, "rewards/format_reward_step": 0.0078125, "rewards/step_l1_reward": -0.00041134387720376253, "step": 131 }, { "adv/mean_abs_final_conf": 0.057708740234375, "adv/mean_abs_reasoning": 0.0578957125544548, "adv/mean_abs_step_conf": 0.0578995943069458, "adv/ratio_final_to_reasoning": 0.9967705325346166, "adv/ratio_step_to_reasoning": 1.0000670473221545, "adv/std_final_conf": 0.2855128347873688, "adv/std_reasoning": 0.2864321172237396, "adv/std_step_conf": 0.2864512503147125, "calib/answer_extract_rate": 0.015625, "calib/auroc": 0.5, "calib/avg_num_step_conf": 0.94921875, "calib/ece": 0.505, "calib/final_conf_rate": 0.015625, "calib/format_rate": 0.01171875, "calib/frac_conf_gt_0.9": 0.25, "calib/gap": -0.010000000000000009, "calib/mean_conf": 0.72, "calib/mu_c": 0.715, "calib/mu_w": 0.725, "calib/nonempty_final_conf_rate": 0.015625, "calib/nonempty_reasoning_rate": 0.92578125, "calib/nonempty_step_conf_rate": 0.921875, "calib/pce": 0.3625, "calib/std_conf": 0.18069310999592653, "calib/step_conf_rate": 0.921875, "calib/step_q_c": 0.7, "calib/step_q_c_n": 2.0, "calib/step_q_gap": 0.16071535269709536, "calib/step_q_w": 0.5392846473029046, "calib/step_q_w_n": 241.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1597.0, "completions/max_terminated_length": 1597.0, "completions/mean_length": 198.7109375, "completions/mean_terminated_length": 198.7109375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.1408, "grad_norm": 0.05701402947306633, "kl": 0.2595062255859375, "learning_rate": 1.888888888888889e-06, "loss": -0.239, "mask/has_final_conf_rate": 0.015625, "mask/share_final_conf": 0.0011298644822090864, "mask/share_reasoning": 0.8932276964187622, "mask/share_step_conf": 0.1056424006819725, "num_tokens": 27464697.0, "reward": 0.003786170156672597, "reward_std": 0.011870051734149456, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.007330859545618296, "rewards/format_reward_step": 0.01171875, "rewards/step_l1_reward": -0.0036647694651037455, "step": 132 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0078125, "calib/avg_num_step_conf": 0.9375, "calib/ece": 0.74, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.74, "calib/mu_c": NaN, "calib/mu_w": 0.74, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.921875, "calib/nonempty_step_conf_rate": 0.9140625, "calib/pce": 0.74, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.9140625, "calib/step_q_w": 0.5189918055555557, "calib/step_q_w_n": 240.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1224.0, "completions/max_terminated_length": 1224.0, "completions/mean_length": 179.5078125, "completions/mean_terminated_length": 179.5078125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.14186666666666667, "grad_norm": 0.002950991503894329, "kl": 0.274261474609375, "learning_rate": 1.8611111111111113e-06, "loss": 0.021, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.00012317004438955337, "mask/share_reasoning": 0.9076688289642334, "mask/share_step_conf": 0.09220802038908005, "num_tokens": 27616995.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 133 }, { "adv/mean_abs_final_conf": 0.038023941218853, "adv/mean_abs_reasoning": 0.038560837507247925, "adv/mean_abs_step_conf": 0.038561079651117325, "adv/ratio_final_to_reasoning": 0.98607664347814, "adv/ratio_step_to_reasoning": 1.000006279528274, "adv/std_final_conf": 0.23041187226772308, "adv/std_reasoning": 0.233650803565979, "adv/std_step_conf": 0.23365232348442078, "calib/answer_extract_rate": 0.0078125, "calib/avg_num_step_conf": 0.890625, "calib/ece": 0.97, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.97, "calib/mu_c": NaN, "calib/mu_w": 0.97, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.875, "calib/nonempty_step_conf_rate": 0.875, "calib/pce": 0.97, "calib/std_conf": 0.020000000000000018, "calib/step_conf_rate": 0.875, "calib/step_q_w": 0.5715896198830409, "calib/step_q_w_n": 228.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2362.0, "completions/max_terminated_length": 2362.0, "completions/mean_length": 226.83984375, "completions/mean_terminated_length": 226.83984375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.14293333333333333, "grad_norm": 0.023352211341261864, "kl": 0.225128173828125, "learning_rate": 1.8333333333333333e-06, "loss": -0.0386, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.0007759142317809165, "mask/share_reasoning": 0.9204539060592651, "mask/share_step_conf": 0.07877011597156525, "num_tokens": 27784018.0, "reward": 0.00019159464864060283, "reward_std": 0.0005419114604592323, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0004585937422234565, "rewards/format_reward_step": 0.0078125, "rewards/step_l1_reward": -0.0016379044391214848, "step": 134 }, { "adv/mean_abs_final_conf": 0.03847457841038704, "adv/mean_abs_reasoning": 0.03858806565403938, "adv/mean_abs_step_conf": 0.03789190202951431, "adv/ratio_final_to_reasoning": 0.9970590066713939, "adv/ratio_step_to_reasoning": 0.9819590950537269, "adv/std_final_conf": 0.23313045501708984, "adv/std_reasoning": 0.23381583392620087, "adv/std_step_conf": 0.22963960468769073, "calib/answer_extract_rate": 0.01171875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 0.98046875, "calib/ece": 0.5566666666666666, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.3333333333333333, "calib/gap": 0.014999999999999902, "calib/mean_conf": 0.6900000000000001, "calib/mu_c": 0.7, "calib/mu_w": 0.685, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.92578125, "calib/nonempty_step_conf_rate": 0.921875, "calib/pce": 0.45666666666666667, "calib/std_conf": 0.2328089345364563, "calib/step_conf_rate": 0.921875, "calib/step_q_c": 0.98, "calib/step_q_c_n": 1.0, "calib/step_q_gap": 0.46262933333333334, "calib/step_q_w": 0.5173706666666666, "calib/step_q_w_n": 250.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2731.0, "completions/max_terminated_length": 2731.0, "completions/mean_length": 213.55859375, "completions/mean_terminated_length": 213.55859375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.144, "grad_norm": 0.03867189958691597, "kl": 0.24249267578125, "learning_rate": 1.8055555555555557e-06, "loss": -0.137, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0007803092012181878, "mask/share_reasoning": 0.9052779674530029, "mask/share_step_conf": 0.09394178539514542, "num_tokens": 27944569.0, "reward": 0.002227193210273981, "reward_std": 0.008019620552659035, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0037855468690395355, "rewards/format_reward_step": 0.0078125, "rewards/step_l1_reward": -0.0016749105416238308, "step": 135 }, { "adv/mean_abs_final_conf": 0.057971805334091187, "adv/mean_abs_reasoning": 0.07720336318016052, "adv/mean_abs_step_conf": 0.05711377039551735, "adv/ratio_final_to_reasoning": 0.75089740843037, "adv/ratio_step_to_reasoning": 0.7397834503949987, "adv/std_final_conf": 0.28680849075317383, "adv/std_reasoning": 0.3307822048664093, "adv/std_step_conf": 0.2825852930545807, "calib/answer_extract_rate": 0.01953125, "calib/auroc": 1.0, "calib/avg_num_step_conf": 0.96484375, "calib/ece": 0.057999999999999996, "calib/final_conf_rate": 0.01953125, "calib/format_rate": 0.01171875, "calib/frac_conf_gt_0.9": 0.6, "calib/gap": 0.8525, "calib/mean_conf": 0.7819999999999999, "calib/mu_c": 0.9525, "calib/mu_w": 0.1, "calib/nonempty_final_conf_rate": 0.01953125, "calib/nonempty_reasoning_rate": 0.92578125, "calib/nonempty_step_conf_rate": 0.91796875, "calib/pce": 0.020000000000000004, "calib/std_conf": 0.34510288321021027, "calib/step_conf_rate": 0.91796875, "calib/step_q_c": 0.7666666666666666, "calib/step_q_c_n": 3.0, "calib/step_q_gap": 0.22086092896174858, "calib/step_q_w": 0.545805737704918, "calib/step_q_w_n": 244.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1040.0, "completions/max_terminated_length": 1040.0, "completions/mean_length": 171.26953125, "completions/mean_terminated_length": 171.94119262695312, "completions/min_length": 0.0, "completions/min_terminated_length": 29.0, "epoch": 0.14506666666666668, "grad_norm": 0.021164115518331528, "kl": 0.27850341796875, "learning_rate": 1.777777777777778e-06, "loss": -0.1423, "mask/has_final_conf_rate": 0.015625, "mask/share_final_conf": 0.001021144213154912, "mask/share_reasoning": 0.8896456956863403, "mask/share_step_conf": 0.10542689263820648, "num_tokens": 28096902.0, "reward": 0.008188535459339619, "reward_std": 0.023160677403211594, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.01162890624254942, "rewards/format_reward_step": 0.01171875, "rewards/step_l1_reward": -0.0007205858128145337, "step": 136 }, { "adv/mean_abs_final_conf": 0.038537055253982544, "adv/mean_abs_reasoning": 0.038560837507247925, "adv/mean_abs_step_conf": 0.038625482469797134, "adv/ratio_final_to_reasoning": 0.9993832537153554, "adv/ratio_step_to_reasoning": 1.0016764408329322, "adv/std_final_conf": 0.2335069626569748, "adv/std_reasoning": 0.233650803565979, "adv/std_step_conf": 0.23404252529144287, "calib/answer_extract_rate": 0.015625, "calib/avg_num_step_conf": 1.02734375, "calib/ece": 0.6466666666666666, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.3333333333333333, "calib/mean_conf": 0.6466666666666666, "calib/mu_c": NaN, "calib/mu_w": 0.6466666666666666, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.94921875, "calib/nonempty_step_conf_rate": 0.9453125, "calib/pce": 0.6466666666666666, "calib/std_conf": 0.3531131389355101, "calib/step_conf_rate": 0.9453125, "calib/step_q_w": 0.5505894803548796, "calib/step_q_w_n": 263.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2118.0, "completions/max_terminated_length": 2118.0, "completions/mean_length": 186.296875, "completions/mean_terminated_length": 186.296875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.14613333333333334, "grad_norm": 0.027069352567195892, "kl": 0.2666015625, "learning_rate": 1.75e-06, "loss": -0.0801, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0010379692539572716, "mask/share_reasoning": 0.8899767398834229, "mask/share_step_conf": 0.10898531973361969, "num_tokens": 28251578.0, "reward": -0.0004488623235374689, "reward_std": 0.001269574393518269, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0015386719023808837, "rewards/format_reward_step": 0.0078125, "rewards/step_l1_reward": -0.003998896572738886, "step": 137 }, { "adv/mean_abs_final_conf": 0.082949697971344, "adv/mean_abs_reasoning": 0.08451591432094574, "adv/mean_abs_step_conf": 0.07661990821361542, "adv/ratio_final_to_reasoning": 0.9814683854255649, "adv/ratio_step_to_reasoning": 0.9065737361919134, "adv/std_final_conf": 0.3310111463069916, "adv/std_reasoning": 0.3308088481426239, "adv/std_step_conf": 0.32838961482048035, "calib/answer_extract_rate": 0.0234375, "calib/auroc": 0.75, "calib/avg_num_step_conf": 0.94140625, "calib/ece": 0.24833333333333332, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.0234375, "calib/frac_conf_gt_0.9": 0.3333333333333333, "calib/gap": 0.0675, "calib/mean_conf": 0.9050000000000001, "calib/mu_c": 0.9275, "calib/mu_w": 0.86, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.921875, "calib/nonempty_step_conf_rate": 0.921875, "calib/pce": 0.24333333333333332, "calib/std_conf": 0.059651767227244266, "calib/step_conf_rate": 0.921875, "calib/step_q_c": 0.82, "calib/step_q_c_n": 4.0, "calib/step_q_gap": 0.24735808720112507, "calib/step_q_w": 0.5726419127988749, "calib/step_q_w_n": 237.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1189.0, "completions/max_terminated_length": 1189.0, "completions/mean_length": 190.85546875, "completions/mean_terminated_length": 191.6039276123047, "completions/min_length": 0.0, "completions/min_terminated_length": 27.0, "epoch": 0.1472, "grad_norm": 0.043301813304424286, "kl": 0.2565765380859375, "learning_rate": 1.7222222222222224e-06, "loss": -0.2439, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.0022418617736548185, "mask/share_reasoning": 0.8968058824539185, "mask/share_step_conf": 0.09704601764678955, "num_tokens": 28404773.0, "reward": 0.011221060529351234, "reward_std": 0.02459767647087574, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.0175175778567791, "rewards/format_reward_step": 0.0234375, "rewards/step_l1_reward": -0.0028879554010927677, "step": 138 }, { "adv/mean_abs_final_conf": 0.01932401955127716, "adv/mean_abs_reasoning": 0.038560837507247925, "adv/mean_abs_step_conf": 0.019282005727291107, "adv/ratio_final_to_reasoning": 0.5011307015218485, "adv/ratio_step_to_reasoning": 0.5000411550622272, "adv/std_final_conf": 0.16558970510959625, "adv/std_reasoning": 0.233650803565979, "adv/std_step_conf": 0.1652296781539917, "calib/answer_extract_rate": 0.01171875, "calib/auroc": 0.25, "calib/avg_num_step_conf": 0.9609375, "calib/ece": 0.6266666666666667, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.44, "calib/mean_conf": 0.29333333333333333, "calib/mu_c": 0.0, "calib/mu_w": 0.44, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.953125, "calib/nonempty_step_conf_rate": 0.9453125, "calib/pce": 0.29333333333333333, "calib/std_conf": 0.41483597829610785, "calib/step_conf_rate": 0.9453125, "calib/step_q_w": 0.5567165311653116, "calib/step_q_w_n": 246.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1581.0, "completions/max_terminated_length": 1581.0, "completions/mean_length": 172.96484375, "completions/mean_terminated_length": 172.96484375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.14826666666666666, "grad_norm": 0.011977670714259148, "kl": 0.289459228515625, "learning_rate": 1.6944444444444446e-06, "loss": -0.0227, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.000581751111894846, "mask/share_reasoning": 0.8981517553329468, "mask/share_step_conf": 0.10126648843288422, "num_tokens": 28552148.0, "reward": 0.002332001691684127, "reward_std": 0.006595896556973457, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.00390625, "rewards/format_reward_step": 0.00390625, "rewards/step_l1_reward": -0.0008047465817071497, "step": 139 }, { "adv/mean_abs_final_conf": 0.07728341221809387, "adv/mean_abs_reasoning": 0.11568251252174377, "adv/mean_abs_step_conf": 0.07716748863458633, "adv/ratio_final_to_reasoning": 0.6680647794848649, "adv/ratio_step_to_reasoning": 0.6670626955831537, "adv/std_final_conf": 0.3311251401901245, "adv/std_reasoning": 0.4046950340270996, "adv/std_step_conf": 0.33062857389450073, "calib/answer_extract_rate": 0.02734375, "calib/auroc": 0.8500000000000001, "calib/avg_num_step_conf": 0.95703125, "calib/ece": 0.33, "calib/final_conf_rate": 0.02734375, "calib/format_rate": 0.015625, "calib/frac_conf_gt_0.9": 0.42857142857142855, "calib/gap": 0.5309999999999999, "calib/mean_conf": 0.6157142857142858, "calib/mu_c": 0.995, "calib/mu_w": 0.4640000000000001, "calib/nonempty_final_conf_rate": 0.02734375, "calib/nonempty_reasoning_rate": 0.9453125, "calib/nonempty_step_conf_rate": 0.93359375, "calib/pce": 0.33, "calib/std_conf": 0.3848880038533341, "calib/step_conf_rate": 0.93359375, "calib/step_q_w": 0.5250217687074831, "calib/step_q_w_n": 245.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1276.0, "completions/max_terminated_length": 1276.0, "completions/mean_length": 181.58984375, "completions/mean_terminated_length": 181.58984375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.14933333333333335, "grad_norm": 0.06448958069086075, "kl": 0.28741455078125, "learning_rate": 1.6666666666666667e-06, "loss": -0.3303, "mask/has_final_conf_rate": 0.02734375, "mask/share_final_conf": 0.0017336525488644838, "mask/share_reasoning": 0.8995578289031982, "mask/share_step_conf": 0.09870850294828415, "num_tokens": 28703651.0, "reward": 0.006518370937556028, "reward_std": 0.0184367373585701, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.012849219143390656, "rewards/format_reward_step": 0.015625, "rewards/step_l1_reward": -0.004499976523220539, "step": 140 }, { "adv/mean_abs_final_conf": 0.0772620216012001, "adv/mean_abs_reasoning": 0.09642931818962097, "adv/mean_abs_step_conf": 0.07724109292030334, "adv/ratio_final_to_reasoning": 0.8012295746950131, "adv/ratio_step_to_reasoning": 0.8010125382035219, "adv/std_final_conf": 0.33103352785110474, "adv/std_reasoning": 0.3695387542247772, "adv/std_step_conf": 0.33094385266304016, "calib/answer_extract_rate": 0.02734375, "calib/auroc": 0.45, "calib/avg_num_step_conf": 0.99609375, "calib/ece": 0.6028571428571428, "calib/final_conf_rate": 0.02734375, "calib/format_rate": 0.015625, "calib/frac_conf_gt_0.9": 0.2857142857142857, "calib/gap": -0.16700000000000004, "calib/mean_conf": 0.6142857142857144, "calib/mu_c": 0.495, "calib/mu_w": 0.662, "calib/nonempty_final_conf_rate": 0.02734375, "calib/nonempty_reasoning_rate": 0.9296875, "calib/nonempty_step_conf_rate": 0.91796875, "calib/pce": 0.46571428571428575, "calib/std_conf": 0.38858193263113366, "calib/step_conf_rate": 0.91796875, "calib/step_q_c": 0.7, "calib/step_q_c_n": 1.0, "calib/step_q_gap": 0.12838582677165356, "calib/step_q_w": 0.5716141732283464, "calib/step_q_w_n": 254.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2185.0, "completions/max_terminated_length": 2185.0, "completions/mean_length": 215.9375, "completions/mean_terminated_length": 215.9375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.1504, "grad_norm": 0.036474522203207016, "kl": 0.249755859375, "learning_rate": 1.638888888888889e-06, "loss": -0.2731, "mask/has_final_conf_rate": 0.02734375, "mask/share_final_conf": 0.0012745312415063381, "mask/share_reasoning": 0.9076339602470398, "mask/share_step_conf": 0.09109152853488922, "num_tokens": 28866027.0, "reward": 0.0038409747648984194, "reward_std": 0.013156676664948463, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.010916406288743019, "rewards/format_reward_step": 0.015625, "rewards/step_l1_reward": -0.007921956479549408, "step": 141 }, { "adv/mean_abs_final_conf": 0.06394700706005096, "adv/mean_abs_reasoning": 0.08143529295921326, "adv/mean_abs_step_conf": 0.05882935971021652, "adv/ratio_final_to_reasoning": 0.7852493032975116, "adv/ratio_step_to_reasoning": 0.7224061899019768, "adv/std_final_conf": 0.28679755330085754, "adv/std_reasoning": 0.33055466413497925, "adv/std_step_conf": 0.2864469587802887, "calib/answer_extract_rate": 0.0234375, "calib/auroc": 1.0, "calib/avg_num_step_conf": 1.01953125, "calib/ece": 0.16166666666666665, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.015625, "calib/frac_conf_gt_0.9": 0.16666666666666666, "calib/gap": 0.73, "calib/mean_conf": 0.4583333333333333, "calib/mu_c": 0.9450000000000001, "calib/mu_w": 0.21500000000000002, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.91015625, "calib/nonempty_step_conf_rate": 0.90234375, "calib/pce": 0.14333333333333334, "calib/std_conf": 0.3875743656240553, "calib/step_conf_rate": 0.90234375, "calib/step_q_c": 0.8, "calib/step_q_c_n": 1.0, "calib/step_q_gap": 0.23791679487179485, "calib/step_q_w": 0.5620832051282052, "calib/step_q_w_n": 260.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2426.0, "completions/max_terminated_length": 2426.0, "completions/mean_length": 196.19921875, "completions/mean_terminated_length": 197.7440948486328, "completions/min_length": 0.0, "completions/min_terminated_length": 31.0, "epoch": 0.15146666666666667, "grad_norm": 0.05591246113181114, "kl": 0.268585205078125, "learning_rate": 1.6111111111111113e-06, "loss": -0.2564, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.0022723714355379343, "mask/share_reasoning": 0.8812129497528076, "mask/share_step_conf": 0.10870218276977539, "num_tokens": 29021414.0, "reward": 0.007314523681998253, "reward_std": 0.01666373386979103, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.01413437444716692, "rewards/format_reward_step": 0.015625, "rewards/step_l1_reward": -0.004192827269434929, "step": 142 }, { "adv/mean_abs_final_conf": 0.05766920745372772, "adv/mean_abs_reasoning": 0.07712167501449585, "adv/mean_abs_step_conf": 0.05647395923733711, "adv/ratio_final_to_reasoning": 0.7477691251245279, "adv/ratio_step_to_reasoning": 0.7322709112155853, "adv/std_final_conf": 0.28531643748283386, "adv/std_reasoning": 0.33043211698532104, "adv/std_step_conf": 0.279586523771286, "calib/answer_extract_rate": 0.01953125, "calib/auroc": 0.75, "calib/avg_num_step_conf": 0.9296875, "calib/ece": 0.542, "calib/final_conf_rate": 0.01953125, "calib/format_rate": 0.01171875, "calib/frac_conf_gt_0.9": 0.2, "calib/gap": 0.1975, "calib/mean_conf": 0.742, "calib/mu_c": 0.9, "calib/mu_w": 0.7025, "calib/nonempty_final_conf_rate": 0.01953125, "calib/nonempty_reasoning_rate": 0.87109375, "calib/nonempty_step_conf_rate": 0.86328125, "calib/pce": 0.542, "calib/std_conf": 0.2530138336138955, "calib/step_conf_rate": 0.86328125, "calib/step_q_w": 0.5040534313725491, "calib/step_q_w_n": 238.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2342.0, "completions/max_terminated_length": 2342.0, "completions/mean_length": 191.19140625, "completions/mean_terminated_length": 191.19140625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.15253333333333333, "grad_norm": 0.05209788307547569, "kl": 0.2831268310546875, "learning_rate": 1.5833333333333333e-06, "loss": -0.2249, "mask/has_final_conf_rate": 0.01953125, "mask/share_final_conf": 0.0014132431242614985, "mask/share_reasoning": 0.89207923412323, "mask/share_step_conf": 0.10650746524333954, "num_tokens": 29177695.0, "reward": 0.0009553575655445457, "reward_std": 0.007277060765773058, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0031640625093132257, "rewards/format_reward_step": 0.01171875, "rewards/step_l1_reward": -0.0043783471919596195, "step": 143 }, { "adv/mean_abs_final_conf": 0.07675210386514664, "adv/mean_abs_reasoning": 0.07714889943599701, "adv/mean_abs_step_conf": 0.07728776335716248, "adv/ratio_final_to_reasoning": 0.9948567565609986, "adv/ratio_step_to_reasoning": 1.0017999468842802, "adv/std_final_conf": 0.32887107133865356, "adv/std_reasoning": 0.33054885268211365, "adv/std_step_conf": 0.3311437666416168, "calib/answer_extract_rate": 0.0234375, "calib/auroc": 0.0, "calib/avg_num_step_conf": 0.91796875, "calib/ece": 0.665, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.015625, "calib/frac_conf_gt_0.9": 0.16666666666666666, "calib/gap": -0.26200000000000007, "calib/mean_conf": 0.6383333333333333, "calib/mu_c": 0.42, "calib/mu_w": 0.682, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.91796875, "calib/nonempty_step_conf_rate": 0.91015625, "calib/pce": 0.5683333333333334, "calib/std_conf": 0.18968541207893544, "calib/step_conf_rate": 0.91015625, "calib/step_q_c": 0.0, "calib/step_q_c_n": 1.0, "calib/step_q_gap": -0.5767234620886981, "calib/step_q_w": 0.5767234620886981, "calib/step_q_w_n": 233.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2758.0, "completions/max_terminated_length": 2758.0, "completions/mean_length": 186.35546875, "completions/mean_terminated_length": 186.35546875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.1536, "grad_norm": 0.04337361454963684, "kl": 0.285064697265625, "learning_rate": 1.5555555555555558e-06, "loss": -0.2841, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.0018886120524257421, "mask/share_reasoning": 0.8982242345809937, "mask/share_step_conf": 0.09988721460103989, "num_tokens": 29329530.0, "reward": -0.0010638143867254257, "reward_std": 0.00474740844219923, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.007497265934944153, "rewards/format_reward_step": 0.015625, "rewards/step_l1_reward": -0.013531144708395004, "step": 144 }, { "adv/mean_abs_final_conf": 0.05796315148472786, "adv/mean_abs_reasoning": 0.057868484407663345, "adv/mean_abs_step_conf": 0.05754847824573517, "adv/ratio_final_to_reasoning": 1.0016359004047457, "adv/ratio_step_to_reasoning": 0.9944701133059949, "adv/std_final_conf": 0.2867657244205475, "adv/std_reasoning": 0.28629741072654724, "adv/std_step_conf": 0.28471487760543823, "calib/answer_extract_rate": 0.015625, "calib/auroc": 0.6666666666666667, "calib/avg_num_step_conf": 0.98046875, "calib/ece": 0.49, "calib/final_conf_rate": 0.015625, "calib/format_rate": 0.01171875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.1466666666666666, "calib/mean_conf": 0.59, "calib/mu_c": 0.7, "calib/mu_w": 0.5533333333333333, "calib/nonempty_final_conf_rate": 0.015625, "calib/nonempty_reasoning_rate": 0.94140625, "calib/nonempty_step_conf_rate": 0.9375, "calib/pce": 0.41500000000000004, "calib/std_conf": 0.26767517628648346, "calib/step_conf_rate": 0.9375, "calib/step_q_c": 0.65, "calib/step_q_c_n": 1.0, "calib/step_q_gap": 0.171516, "calib/step_q_w": 0.478484, "calib/step_q_w_n": 250.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 139.59375, "completions/mean_terminated_length": 139.59375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.15466666666666667, "grad_norm": 0.05382170528173447, "kl": 0.33148193359375, "learning_rate": 1.527777777777778e-06, "loss": -0.1813, "mask/has_final_conf_rate": 0.015625, "mask/share_final_conf": 0.002252190839499235, "mask/share_reasoning": 0.8799644112586975, "mask/share_step_conf": 0.11778340488672256, "num_tokens": 29467970.0, "reward": 0.0059430343098938465, "reward_std": 0.016809439286589622, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.009679296985268593, "rewards/format_reward_step": 0.01171875, "rewards/step_l1_reward": -0.0009182285284623504, "step": 145 }, { "adv/mean_abs_final_conf": 0.09656653553247452, "adv/mean_abs_reasoning": 0.09645655006170273, "adv/mean_abs_step_conf": 0.09653792530298233, "adv/ratio_final_to_reasoning": 1.0011402592224317, "adv/ratio_step_to_reasoning": 1.0008436466080068, "adv/std_final_conf": 0.3700646162033081, "adv/std_reasoning": 0.36964312195777893, "adv/std_step_conf": 0.36995500326156616, "calib/answer_extract_rate": 0.03125, "calib/auroc": 0.55, "calib/avg_num_step_conf": 0.91796875, "calib/ece": 0.36714285714285705, "calib/final_conf_rate": 0.02734375, "calib/format_rate": 0.01953125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.026999999999999913, "calib/mean_conf": 0.5157142857142858, "calib/mu_c": 0.5349999999999999, "calib/mu_w": 0.508, "calib/nonempty_final_conf_rate": 0.02734375, "calib/nonempty_reasoning_rate": 0.91015625, "calib/nonempty_step_conf_rate": 0.8984375, "calib/pce": 0.29857142857142854, "calib/std_conf": 0.3107068964638604, "calib/step_conf_rate": 0.8984375, "calib/step_q_c": 0.386, "calib/step_q_c_n": 5.0, "calib/step_q_gap": -0.15236942028985512, "calib/step_q_w": 0.5383694202898551, "calib/step_q_w_n": 230.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1154.0, "completions/max_terminated_length": 1154.0, "completions/mean_length": 196.71484375, "completions/mean_terminated_length": 196.71484375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.15573333333333333, "grad_norm": 0.06925830245018005, "kl": 0.254302978515625, "learning_rate": 1.5e-06, "loss": -0.3652, "mask/has_final_conf_rate": 0.02734375, "mask/share_final_conf": 0.0016984788235276937, "mask/share_reasoning": 0.9018841981887817, "mask/share_step_conf": 0.09641735255718231, "num_tokens": 29625545.0, "reward": 0.003927886951714754, "reward_std": 0.015175402164459229, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.012025780975818634, "rewards/format_reward_step": 0.01953125, "rewards/step_l1_reward": -0.009638756513595581, "step": 146 }, { "adv/mean_abs_final_conf": 0.10098693519830704, "adv/mean_abs_reasoning": 0.12002336233854294, "adv/mean_abs_step_conf": 0.10241132974624634, "adv/ratio_final_to_reasoning": 0.8413939855597367, "adv/ratio_step_to_reasoning": 0.8532616296599044, "adv/std_final_conf": 0.3682190775871277, "adv/std_reasoning": 0.40489038825035095, "adv/std_step_conf": 0.36968347430229187, "calib/answer_extract_rate": 0.03125, "calib/auroc": 0.7, "calib/avg_num_step_conf": 0.9609375, "calib/ece": 0.43125, "calib/final_conf_rate": 0.03125, "calib/format_rate": 0.0234375, "calib/frac_conf_gt_0.9": 0.375, "calib/gap": 0.19933333333333336, "calib/mean_conf": 0.60875, "calib/mu_c": 0.7333333333333334, "calib/mu_w": 0.534, "calib/nonempty_final_conf_rate": 0.03125, "calib/nonempty_reasoning_rate": 0.94921875, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.3325, "calib/std_conf": 0.3560701300305882, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.635, "calib/step_q_c_n": 2.0, "calib/step_q_gap": 0.07624316939890718, "calib/step_q_w": 0.5587568306010928, "calib/step_q_w_n": 244.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2393.0, "completions/max_terminated_length": 2393.0, "completions/mean_length": 194.4140625, "completions/mean_terminated_length": 195.17648315429688, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.1568, "grad_norm": 0.04261239245533943, "kl": 0.2639923095703125, "learning_rate": 1.4722222222222225e-06, "loss": -0.4112, "mask/has_final_conf_rate": 0.03125, "mask/share_final_conf": 0.0023956787772476673, "mask/share_reasoning": 0.890393853187561, "mask/share_step_conf": 0.10330420732498169, "num_tokens": 29778995.0, "reward": 0.005495469085872173, "reward_std": 0.021126050502061844, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.01336992159485817, "rewards/format_reward_step": 0.0234375, "rewards/step_l1_reward": -0.009410234168171883, "step": 147 }, { "adv/mean_abs_final_conf": 0.019322939217090607, "adv/mean_abs_reasoning": 0.03858806565403938, "adv/mean_abs_step_conf": 0.019289027899503708, "adv/ratio_final_to_reasoning": 0.500749101816351, "adv/ratio_step_to_reasoning": 0.499870298564306, "adv/std_final_conf": 0.16558043658733368, "adv/std_reasoning": 0.23381584882736206, "adv/std_step_conf": 0.16528984904289246, "calib/answer_extract_rate": 0.0078125, "calib/auroc": 0.25, "calib/avg_num_step_conf": 0.984375, "calib/ece": 0.3666666666666666, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.19999999999999996, "calib/mean_conf": 0.5666666666666667, "calib/mu_c": 0.5, "calib/mu_w": 0.7, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.91796875, "calib/nonempty_step_conf_rate": 0.9140625, "calib/pce": 0.1333333333333333, "calib/std_conf": 0.18856180831641264, "calib/step_conf_rate": 0.9140625, "calib/step_q_c": 0.8, "calib/step_q_c_n": 1.0, "calib/step_q_gap": 0.2550377158034529, "calib/step_q_w": 0.5449622841965471, "calib/step_q_w_n": 251.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 177.83203125, "completions/mean_terminated_length": 177.83203125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.15786666666666666, "grad_norm": 0.017495930194854736, "kl": 0.280548095703125, "learning_rate": 1.4444444444444445e-06, "loss": -0.0763, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.000960962031967938, "mask/share_reasoning": 0.8899896144866943, "mask/share_step_conf": 0.10904946178197861, "num_tokens": 29929632.0, "reward": 0.002485139761120081, "reward_std": 0.007029036059975624, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.003554687602445483, "rewards/format_reward_step": 0.00390625, "rewards/step_l1_reward": -0.0009281584643758833, "step": 148 }, { "adv/mean_abs_final_conf": 0.09658879786729813, "adv/mean_abs_reasoning": 0.11573696881532669, "adv/mean_abs_step_conf": 0.09627419710159302, "adv/ratio_final_to_reasoning": 0.8345544112306764, "adv/ratio_step_to_reasoning": 0.831836172029103, "adv/std_final_conf": 0.3701499104499817, "adv/std_reasoning": 0.4048856496810913, "adv/std_step_conf": 0.3689470887184143, "calib/answer_extract_rate": 0.03125, "calib/auroc": 1.0, "calib/avg_num_step_conf": 0.96875, "calib/ece": 0.38499999999999995, "calib/final_conf_rate": 0.03125, "calib/format_rate": 0.01953125, "calib/frac_conf_gt_0.9": 0.375, "calib/gap": 0.376, "calib/mean_conf": 0.745, "calib/mu_c": 0.98, "calib/mu_w": 0.604, "calib/nonempty_final_conf_rate": 0.03125, "calib/nonempty_reasoning_rate": 0.921875, "calib/nonempty_step_conf_rate": 0.91015625, "calib/pce": 0.37749999999999995, "calib/std_conf": 0.27013885318480196, "calib/step_conf_rate": 0.91015625, "calib/step_q_c": 0.865, "calib/step_q_c_n": 2.0, "calib/step_q_gap": 0.3331924119241193, "calib/step_q_w": 0.5318075880758807, "calib/step_q_w_n": 246.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2090.0, "completions/max_terminated_length": 2090.0, "completions/mean_length": 193.4765625, "completions/mean_terminated_length": 194.2353057861328, "completions/min_length": 0.0, "completions/min_terminated_length": 25.0, "epoch": 0.15893333333333334, "grad_norm": 0.05413132533431053, "kl": 0.2761993408203125, "learning_rate": 1.4166666666666667e-06, "loss": -0.323, "mask/has_final_conf_rate": 0.03125, "mask/share_final_conf": 0.0027247373946011066, "mask/share_reasoning": 0.8881813287734985, "mask/share_step_conf": 0.10518766939640045, "num_tokens": 30083618.0, "reward": 0.008374178782105446, "reward_std": 0.0236857570707798, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.01495507825165987, "rewards/format_reward_step": 0.01953125, "rewards/step_l1_reward": -0.004456719383597374, "step": 149 }, { "adv/mean_abs_final_conf": 0.03864767402410507, "adv/mean_abs_reasoning": 0.05784125626087189, "adv/mean_abs_step_conf": 0.03861791640520096, "adv/ratio_final_to_reasoning": 0.6681679569648148, "adv/ratio_step_to_reasoning": 0.6676534864842654, "adv/std_final_conf": 0.23417697846889496, "adv/std_reasoning": 0.2861626148223877, "adv/std_step_conf": 0.2339966893196106, "calib/answer_extract_rate": 0.01171875, "calib/avg_num_step_conf": 0.9765625, "calib/ece": 0.38666666666666666, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.3333333333333333, "calib/mean_conf": 0.38666666666666666, "calib/mu_c": NaN, "calib/mu_w": 0.38666666666666666, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.921875, "calib/nonempty_step_conf_rate": 0.91796875, "calib/pce": 0.38666666666666666, "calib/std_conf": 0.4259368758656876, "calib/step_conf_rate": 0.91796875, "calib/step_q_w": 0.5733213333333333, "calib/step_q_w_n": 250.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2010.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 195.73828125, "completions/mean_terminated_length": 195.73828125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.16, "grad_norm": 0.03214937448501587, "kl": 0.2613067626953125, "learning_rate": 1.3888888888888892e-06, "loss": -0.1478, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0011974748922511935, "mask/share_reasoning": 0.894477128982544, "mask/share_step_conf": 0.10432544350624084, "num_tokens": 30238687.0, "reward": 0.0031226295977830887, "reward_std": 0.008832130581140518, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.007685937453061342, "rewards/format_reward_step": 0.0078125, "rewards/step_l1_reward": -0.003784427884966135, "step": 150 }, { "adv/mean_abs_final_conf": 0.03864070400595665, "adv/mean_abs_reasoning": 0.03861529380083084, "adv/mean_abs_step_conf": 0.038446031510829926, "adv/ratio_final_to_reasoning": 1.0006580347480163, "adv/ratio_step_to_reasoning": 0.9956167032970425, "adv/std_final_conf": 0.2341347485780716, "adv/std_reasoning": 0.23398077487945557, "adv/std_step_conf": 0.2329564392566681, "calib/answer_extract_rate": 0.015625, "calib/auroc": 0.6666666666666667, "calib/avg_num_step_conf": 0.90234375, "calib/ece": 0.422, "calib/final_conf_rate": 0.01953125, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.2, "calib/gap": 0.15000000000000008, "calib/mean_conf": 0.4699999999999999, "calib/mu_c": 0.56, "calib/mu_w": 0.41, "calib/nonempty_final_conf_rate": 0.01953125, "calib/nonempty_reasoning_rate": 0.90625, "calib/nonempty_step_conf_rate": 0.8984375, "calib/pce": 0.246, "calib/std_conf": 0.35162480003549235, "calib/step_conf_rate": 0.8984375, "calib/step_q_c": 0.835, "calib/step_q_c_n": 2.0, "calib/step_q_gap": 0.3039599708879185, "calib/step_q_w": 0.5310400291120815, "calib/step_q_w_n": 229.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1884.0, "completions/max_terminated_length": 1884.0, "completions/mean_length": 212.734375, "completions/mean_terminated_length": 212.734375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.16106666666666666, "grad_norm": 0.018881434574723244, "kl": 0.2248992919921875, "learning_rate": 1.3611111111111112e-06, "loss": -0.054, "mask/has_final_conf_rate": 0.01953125, "mask/share_final_conf": 0.0012571100378409028, "mask/share_reasoning": 0.9030216932296753, "mask/share_step_conf": 0.09572114050388336, "num_tokens": 30400171.0, "reward": 0.004041813313961029, "reward_std": 0.011431975290179253, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.006074219010770321, "rewards/format_reward_step": 0.0078125, "rewards/step_l1_reward": -0.0011155917309224606, "step": 151 }, { "adv/mean_abs_final_conf": 0.05794394388794899, "adv/mean_abs_reasoning": 0.05784125626087189, "adv/mean_abs_step_conf": 0.057304129004478455, "adv/ratio_final_to_reasoning": 1.001775335352572, "adv/ratio_step_to_reasoning": 0.9907137691828318, "adv/std_final_conf": 0.2866707146167755, "adv/std_reasoning": 0.2861626148223877, "adv/std_step_conf": 0.2835402190685272, "calib/answer_extract_rate": 0.0234375, "calib/avg_num_step_conf": 0.9765625, "calib/ece": 0.5714285714285714, "calib/final_conf_rate": 0.02734375, "calib/format_rate": 0.01171875, "calib/frac_conf_gt_0.9": 0.14285714285714285, "calib/mean_conf": 0.5714285714285714, "calib/mu_c": NaN, "calib/mu_w": 0.5714285714285714, "calib/nonempty_final_conf_rate": 0.02734375, "calib/nonempty_reasoning_rate": 0.90625, "calib/nonempty_step_conf_rate": 0.89453125, "calib/pce": 0.5714285714285714, "calib/std_conf": 0.26264743187282025, "calib/step_conf_rate": 0.89453125, "calib/step_q_w": 0.47026093333333335, "calib/step_q_w_n": 250.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1148.0, "completions/max_terminated_length": 1148.0, "completions/mean_length": 183.16015625, "completions/mean_terminated_length": 183.87844848632812, "completions/min_length": 0.0, "completions/min_terminated_length": 27.0, "epoch": 0.16213333333333332, "grad_norm": 0.03377218544483185, "kl": 0.319915771484375, "learning_rate": 1.3333333333333334e-06, "loss": -0.1464, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.0020097133237868547, "mask/share_reasoning": 0.887736976146698, "mask/share_step_conf": 0.10634706169366837, "num_tokens": 30552452.0, "reward": 0.0029946179129183292, "reward_std": 0.010179774835705757, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.00791757833212614, "rewards/format_reward_step": 0.01171875, "rewards/step_l1_reward": -0.004272093065083027, "step": 152 }, { "adv/mean_abs_final_conf": 0.01932401955127716, "adv/mean_abs_reasoning": 0.038560837507247925, "adv/mean_abs_step_conf": 0.01273763831704855, "adv/ratio_final_to_reasoning": 0.5011307015218485, "adv/ratio_step_to_reasoning": 0.33032576936780417, "adv/std_final_conf": 0.16558970510959625, "adv/std_reasoning": 0.233650803565979, "adv/std_step_conf": 0.10915025323629379, "calib/answer_extract_rate": 0.015625, "calib/auroc": 1.0, "calib/avg_num_step_conf": 0.9765625, "calib/ece": 0.3233333333333333, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.465, "calib/mean_conf": 0.59, "calib/mu_c": 0.9, "calib/mu_w": 0.435, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.91796875, "calib/nonempty_step_conf_rate": 0.90625, "calib/pce": 0.29, "calib/std_conf": 0.4173727350941841, "calib/step_conf_rate": 0.90625, "calib/step_q_w": 0.5264308, "calib/step_q_w_n": 250.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2626.0, "completions/max_terminated_length": 2626.0, "completions/mean_length": 198.97265625, "completions/mean_terminated_length": 198.97265625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.1632, "grad_norm": 0.005113726481795311, "kl": 0.2578277587890625, "learning_rate": 1.3055555555555556e-06, "loss": -0.0499, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0007983995601534843, "mask/share_reasoning": 0.9040226936340332, "mask/share_step_conf": 0.09517890214920044, "num_tokens": 30710709.0, "reward": 0.002732241991907358, "reward_std": 0.00772794708609581, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.00390625, "rewards/format_reward_step": 0.00390625, "rewards/step_l1_reward": -4.266354608262191e-06, "step": 153 }, { "adv/mean_abs_final_conf": 0.11585914343595505, "adv/mean_abs_reasoning": 0.13501739501953125, "adv/mean_abs_step_conf": 0.1158389151096344, "adv/ratio_final_to_reasoning": 0.8581053087211109, "adv/ratio_step_to_reasoning": 0.8579554885715093, "adv/std_final_conf": 0.4053131937980652, "adv/std_reasoning": 0.4372970759868622, "adv/std_step_conf": 0.40524232387542725, "calib/answer_extract_rate": 0.03125, "calib/auroc": 0.7, "calib/avg_num_step_conf": 0.9296875, "calib/ece": 0.44999999999999996, "calib/final_conf_rate": 0.03125, "calib/format_rate": 0.0234375, "calib/frac_conf_gt_0.9": 0.125, "calib/gap": 0.11599999999999999, "calib/mean_conf": 0.6675, "calib/mu_c": 0.7399999999999999, "calib/mu_w": 0.6239999999999999, "calib/nonempty_final_conf_rate": 0.03125, "calib/nonempty_reasoning_rate": 0.91796875, "calib/nonempty_step_conf_rate": 0.91015625, "calib/pce": 0.37124999999999997, "calib/std_conf": 0.2963001012487171, "calib/step_conf_rate": 0.91015625, "calib/step_q_c": 0.5, "calib/step_q_c_n": 2.0, "calib/step_q_gap": -0.022537288135593192, "calib/step_q_w": 0.5225372881355932, "calib/step_q_w_n": 236.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2523.0, "completions/max_terminated_length": 2523.0, "completions/mean_length": 202.93359375, "completions/mean_terminated_length": 202.93359375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.16426666666666667, "grad_norm": 0.059825729578733444, "kl": 0.283966064453125, "learning_rate": 1.2777777777777779e-06, "loss": -0.4398, "mask/has_final_conf_rate": 0.03125, "mask/share_final_conf": 0.0027232125867158175, "mask/share_reasoning": 0.8968022465705872, "mask/share_step_conf": 0.1004745364189148, "num_tokens": 30867100.0, "reward": 0.005700921639800072, "reward_std": 0.021998731419444084, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.015060937032103539, "rewards/format_reward_step": 0.0234375, "rewards/step_l1_reward": -0.010690344497561455, "step": 154 }, { "adv/mean_abs_final_conf": 0.025226594880223274, "adv/mean_abs_reasoning": 0.044541243463754654, "adv/mean_abs_step_conf": 0.02512870728969574, "adv/ratio_final_to_reasoning": 0.5663648546487339, "adv/ratio_step_to_reasoning": 0.564167170369731, "adv/std_final_conf": 0.16532482206821442, "adv/std_reasoning": 0.23372870683670044, "adv/std_step_conf": 0.1656012386083603, "calib/answer_extract_rate": 0.015625, "calib/auroc": 0.0, "calib/avg_num_step_conf": 0.9296875, "calib/ece": 0.7775, "calib/final_conf_rate": 0.015625, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.41, "calib/mean_conf": 0.7475, "calib/mu_c": 0.44, "calib/mu_w": 0.85, "calib/nonempty_final_conf_rate": 0.015625, "calib/nonempty_reasoning_rate": 0.9140625, "calib/nonempty_step_conf_rate": 0.90625, "calib/pce": 0.6375, "calib/std_conf": 0.18592673288153053, "calib/step_conf_rate": 0.90625, "calib/step_q_w": 0.5171634453781513, "calib/step_q_w_n": 238.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 742.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 151.33984375, "completions/mean_terminated_length": 151.933349609375, "completions/min_length": 0.0, "completions/min_terminated_length": 17.0, "epoch": 0.16533333333333333, "grad_norm": 0.01697084680199623, "kl": 0.331878662109375, "learning_rate": 1.25e-06, "loss": -0.072, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0013132116291671991, "mask/share_reasoning": 0.8803766369819641, "mask/share_step_conf": 0.1144038736820221, "num_tokens": 31013059.0, "reward": -0.0014306087978184223, "reward_std": 0.004541726782917976, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0015542968176305294, "rewards/format_reward_step": 0.0078125, "rewards/step_l1_reward": -0.0067592645063996315, "step": 155 }, { "adv/mean_abs_final_conf": 0.019319972023367882, "adv/mean_abs_reasoning": 0.019280418753623962, "adv/mean_abs_step_conf": 0.019240034744143486, "adv/ratio_final_to_reasoning": 1.0020514735830872, "adv/ratio_step_to_reasoning": 0.9979054391921397, "adv/std_final_conf": 0.16555501520633698, "adv/std_reasoning": 0.16521605849266052, "adv/std_step_conf": 0.164870023727417, "calib/answer_extract_rate": 0.01171875, "calib/avg_num_step_conf": 0.9453125, "calib/ece": 0.5366666666666666, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.3333333333333333, "calib/mean_conf": 0.5366666666666666, "calib/mu_c": NaN, "calib/mu_w": 0.5366666666666666, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.93359375, "calib/nonempty_step_conf_rate": 0.92578125, "calib/pce": 0.5366666666666666, "calib/std_conf": 0.3472111109333276, "calib/step_conf_rate": 0.92578125, "calib/step_q_w": 0.5298030303030303, "calib/step_q_w_n": 242.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1701.0, "completions/max_terminated_length": 1701.0, "completions/mean_length": 170.19921875, "completions/mean_terminated_length": 170.19921875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.1664, "grad_norm": 0.023405877873301506, "kl": 0.296722412109375, "learning_rate": 1.2222222222222223e-06, "loss": -0.0548, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0011736187152564526, "mask/share_reasoning": 0.8808482885360718, "mask/share_step_conf": 0.11797812581062317, "num_tokens": 31161390.0, "reward": 0.00159166450612247, "reward_std": 0.004501907154917717, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.002850000048056245, "rewards/format_reward_step": 0.00390625, "rewards/step_l1_reward": -0.0004479209310375154, "step": 156 }, { "adv/mean_abs_final_conf": 0.03864702582359314, "adv/mean_abs_reasoning": 0.08310207724571228, "adv/mean_abs_step_conf": 0.038230299949645996, "adv/ratio_final_to_reasoning": 0.46505487112341903, "adv/ratio_step_to_reasoning": 0.46004024468135085, "adv/std_final_conf": 0.23417307436466217, "adv/std_reasoning": 0.3304872214794159, "adv/std_step_conf": 0.23165775835514069, "calib/answer_extract_rate": 0.0234375, "calib/auroc": 0.6666666666666666, "calib/avg_num_step_conf": 0.90625, "calib/ece": 0.398, "calib/final_conf_rate": 0.01953125, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.17666666666666664, "calib/mean_conf": 0.306, "calib/mu_c": 0.37666666666666665, "calib/mu_w": 0.2, "calib/nonempty_final_conf_rate": 0.01953125, "calib/nonempty_reasoning_rate": 0.90234375, "calib/nonempty_step_conf_rate": 0.88671875, "calib/pce": 0.052000000000000005, "calib/std_conf": 0.17884071124886525, "calib/step_conf_rate": 0.88671875, "calib/step_q_w": 0.546862643678161, "calib/step_q_w_n": 232.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1117.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 165.22265625, "completions/mean_terminated_length": 165.22265625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.16746666666666668, "grad_norm": 0.02955986186861992, "kl": 0.29241943359375, "learning_rate": 1.1944444444444446e-06, "loss": -0.1381, "mask/has_final_conf_rate": 0.01953125, "mask/share_final_conf": 0.0010385285131633282, "mask/share_reasoning": 0.880927324295044, "mask/share_step_conf": 0.11803416907787323, "num_tokens": 31307415.0, "reward": 0.005156665109097958, "reward_std": 0.01382213644683361, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.007471875287592411, "rewards/format_reward_step": 0.0078125, "rewards/step_l1_reward": -0.0010647946037352085, "step": 157 }, { "adv/mean_abs_final_conf": 0.05792781710624695, "adv/mean_abs_reasoning": 0.07714889943599701, "adv/mean_abs_step_conf": 0.057842917740345, "adv/ratio_final_to_reasoning": 0.750857336005215, "adv/ratio_step_to_reasoning": 0.7497568748641927, "adv/std_final_conf": 0.28659093379974365, "adv/std_reasoning": 0.33054885268211365, "adv/std_step_conf": 0.28617167472839355, "calib/answer_extract_rate": 0.03125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 0.9765625, "calib/ece": 0.5442857142857144, "calib/final_conf_rate": 0.02734375, "calib/format_rate": 0.01171875, "calib/frac_conf_gt_0.9": 0.14285714285714285, "calib/gap": -0.03500000000000003, "calib/mean_conf": 0.5700000000000001, "calib/mu_c": 0.545, "calib/mu_w": 0.5800000000000001, "calib/nonempty_final_conf_rate": 0.02734375, "calib/nonempty_reasoning_rate": 0.95703125, "calib/nonempty_step_conf_rate": 0.9375, "calib/pce": 0.41428571428571426, "calib/std_conf": 0.2770250117640231, "calib/step_conf_rate": 0.9375, "calib/step_q_c": 0.58, "calib/step_q_c_n": 1.0, "calib/step_q_gap": 0.04874899598393567, "calib/step_q_w": 0.5312510040160643, "calib/step_q_w_n": 249.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1476.0, "completions/max_terminated_length": 1476.0, "completions/mean_length": 171.08984375, "completions/mean_terminated_length": 171.08984375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.16853333333333334, "grad_norm": 0.044954050332307816, "kl": 0.348052978515625, "learning_rate": 1.1666666666666668e-06, "loss": -0.1857, "mask/has_final_conf_rate": 0.02734375, "mask/share_final_conf": 0.0017155336681753397, "mask/share_reasoning": 0.885796070098877, "mask/share_step_conf": 0.11248837411403656, "num_tokens": 31456454.0, "reward": 0.0025267673190683126, "reward_std": 0.009554600343108177, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.005995702929794788, "rewards/format_reward_step": 0.01171875, "rewards/step_l1_reward": -0.004848418291658163, "step": 158 }, { "adv/mean_abs_final_conf": 0.038645416498184204, "adv/mean_abs_reasoning": 0.03858806565403938, "adv/mean_abs_step_conf": 0.03852042183279991, "adv/ratio_final_to_reasoning": 1.0014862326777143, "adv/ratio_step_to_reasoning": 0.9982470274139696, "adv/std_final_conf": 0.234163299202919, "adv/std_reasoning": 0.23381583392620087, "adv/std_step_conf": 0.23340687155723572, "calib/answer_extract_rate": 0.01171875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 0.96484375, "calib/ece": 0.35, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.25500000000000006, "calib/mean_conf": 0.38999999999999996, "calib/mu_c": 0.56, "calib/mu_w": 0.305, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.92578125, "calib/nonempty_step_conf_rate": 0.921875, "calib/pce": 0.2033333333333333, "calib/std_conf": 0.26919633479426625, "calib/step_conf_rate": 0.921875, "calib/step_q_c": 1.0, "calib/step_q_c_n": 1.0, "calib/step_q_gap": 0.44775632653061226, "calib/step_q_w": 0.5522436734693877, "calib/step_q_w_n": 245.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1068.0, "completions/max_terminated_length": 1068.0, "completions/mean_length": 167.48828125, "completions/mean_terminated_length": 167.48828125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.1696, "grad_norm": 0.029850637540221214, "kl": 0.308319091796875, "learning_rate": 1.138888888888889e-06, "loss": -0.13, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0015966113423928618, "mask/share_reasoning": 0.8897839784622192, "mask/share_step_conf": 0.10861947387456894, "num_tokens": 31604115.0, "reward": 0.0034665153361856937, "reward_std": 0.009804786182940006, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0070558590814471245, "rewards/format_reward_step": 0.0078125, "rewards/step_l1_reward": -0.0024665785022079945, "step": 159 }, { "adv/mean_abs_final_conf": 0.05790805071592331, "adv/mean_abs_reasoning": 0.09645655006170273, "adv/mean_abs_step_conf": 0.05781654268503189, "adv/ratio_final_to_reasoning": 0.6003537414398488, "adv/ratio_step_to_reasoning": 0.5994050445308998, "adv/std_final_conf": 0.28649330139160156, "adv/std_reasoning": 0.36964312195777893, "adv/std_step_conf": 0.28604114055633545, "calib/answer_extract_rate": 0.0234375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 0.95703125, "calib/ece": 0.5, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.01171875, "calib/frac_conf_gt_0.9": 0.16666666666666666, "calib/gap": -0.015000000000000013, "calib/mean_conf": 0.52, "calib/mu_c": 0.515, "calib/mu_w": 0.53, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.95703125, "calib/nonempty_step_conf_rate": 0.9453125, "calib/pce": 0.17666666666666667, "calib/std_conf": 0.26695817400234567, "calib/step_conf_rate": 0.9453125, "calib/step_q_c": 0.645, "calib/step_q_c_n": 2.0, "calib/step_q_gap": 0.14981111111111106, "calib/step_q_w": 0.49518888888888896, "calib/step_q_w_n": 243.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1638.0, "completions/max_terminated_length": 1638.0, "completions/mean_length": 178.32421875, "completions/mean_terminated_length": 178.32421875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.17066666666666666, "grad_norm": 0.03517092391848564, "kl": 0.298797607421875, "learning_rate": 1.111111111111111e-06, "loss": -0.2384, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.0011997153051197529, "mask/share_reasoning": 0.8991638422012329, "mask/share_step_conf": 0.09963646531105042, "num_tokens": 31754606.0, "reward": 0.003261414123699069, "reward_std": 0.01592809334397316, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.0059742191806435585, "rewards/format_reward_step": 0.01171875, "rewards/step_l1_reward": -0.004920140374451876, "step": 160 }, { "adv/mean_abs_final_conf": 0.0193235632032156, "adv/mean_abs_reasoning": 0.06382165849208832, "adv/mean_abs_step_conf": 0.019290367141366005, "adv/ratio_final_to_reasoning": 0.3027743819225734, "adv/ratio_step_to_reasoning": 0.3022542440472202, "adv/std_final_conf": 0.16558578610420227, "adv/std_reasoning": 0.2862262427806854, "adv/std_step_conf": 0.16530132293701172, "calib/answer_extract_rate": 0.015625, "calib/auroc": 1.0, "calib/avg_num_step_conf": 0.9296875, "calib/ece": 0.20666666666666667, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.39000000000000007, "calib/mean_conf": 0.46, "calib/mu_c": 0.5900000000000001, "calib/mu_w": 0.2, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.9296875, "calib/nonempty_step_conf_rate": 0.91796875, "calib/pce": 0.0, "calib/std_conf": 0.31283648551066845, "calib/step_conf_rate": 0.91796875, "calib/step_q_w": 0.5152964985994398, "calib/step_q_w_n": 238.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1412.0, "completions/max_terminated_length": 1412.0, "completions/mean_length": 173.3828125, "completions/mean_terminated_length": 173.3828125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.17173333333333332, "grad_norm": 0.020821698009967804, "kl": 0.2965240478515625, "learning_rate": 1.0833333333333335e-06, "loss": -0.0847, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.00027795127243734896, "mask/share_reasoning": 0.8859891891479492, "mask/share_step_conf": 0.11373290419578552, "num_tokens": 31902912.0, "reward": 0.0029594521038234234, "reward_std": 0.007607479579746723, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.0037499999161809683, "rewards/format_reward_step": 0.00390625, "rewards/step_l1_reward": -0.000956095929723233, "step": 161 }, { "adv/mean_abs_final_conf": 0.038639381527900696, "adv/mean_abs_reasoning": 0.05784125626087189, "adv/mean_abs_step_conf": 0.038621000945568085, "adv/ratio_final_to_reasoning": 0.6680245905039106, "adv/ratio_step_to_reasoning": 0.6677068141705317, "adv/std_final_conf": 0.2341267615556717, "adv/std_reasoning": 0.2861626148223877, "adv/std_step_conf": 0.23401540517807007, "calib/answer_extract_rate": 0.0234375, "calib/auroc": 0.8, "calib/avg_num_step_conf": 0.95703125, "calib/ece": 0.30833333333333335, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.238, "calib/mean_conf": 0.3116666666666667, "calib/mu_c": 0.51, "calib/mu_w": 0.272, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.953125, "calib/nonempty_step_conf_rate": 0.9375, "calib/pce": 0.22666666666666668, "calib/std_conf": 0.21145658866275338, "calib/step_conf_rate": 0.9375, "calib/step_q_w": 0.560511700680272, "calib/step_q_w_n": 245.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 153.421875, "completions/mean_terminated_length": 154.02354431152344, "completions/min_length": 0.0, "completions/min_terminated_length": 23.0, "epoch": 0.1728, "grad_norm": 0.0327790267765522, "kl": 0.328704833984375, "learning_rate": 1.0555555555555557e-06, "loss": -0.1476, "mask/has_final_conf_rate": 0.01953125, "mask/share_final_conf": 0.001432921038940549, "mask/share_reasoning": 0.8785004019737244, "mask/share_step_conf": 0.11616045236587524, "num_tokens": 32046332.0, "reward": 0.001956491032615304, "reward_std": 0.0058908602222800255, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.005937109235674143, "rewards/format_reward_step": 0.0078125, "rewards/step_l1_reward": -0.00436787772923708, "step": 162 }, { "adv/mean_abs_final_conf": 0.057446904480457306, "adv/mean_abs_reasoning": 0.1024097204208374, "adv/mean_abs_step_conf": 0.05789678916335106, "adv/ratio_final_to_reasoning": 0.5609516776765707, "adv/ratio_step_to_reasoning": 0.5653446657742339, "adv/std_final_conf": 0.2842351794242859, "adv/std_reasoning": 0.3695880174636841, "adv/std_step_conf": 0.2864377796649933, "calib/answer_extract_rate": 0.02734375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 0.953125, "calib/ece": 0.55975, "calib/final_conf_rate": 0.015625, "calib/format_rate": 0.01171875, "calib/frac_conf_gt_0.9": 0.25, "calib/gap": -0.11949999999999994, "calib/mean_conf": 0.45974999999999994, "calib/mu_c": 0.4, "calib/mu_w": 0.5195, "calib/nonempty_final_conf_rate": 0.015625, "calib/nonempty_reasoning_rate": 0.94921875, "calib/nonempty_step_conf_rate": 0.93359375, "calib/pce": 0.25975, "calib/std_conf": 0.3782263707093941, "calib/step_conf_rate": 0.93359375, "calib/step_q_c": 0.08, "calib/step_q_c_n": 1.0, "calib/step_q_gap": -0.44900109739369004, "calib/step_q_w": 0.5290010973936901, "calib/step_q_w_n": 243.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 155.38671875, "completions/mean_terminated_length": 155.38671875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.17386666666666667, "grad_norm": 0.06696376949548721, "kl": 0.336578369140625, "learning_rate": 1.0277777777777777e-06, "loss": -0.2386, "mask/has_final_conf_rate": 0.015625, "mask/share_final_conf": 0.0017463011899963021, "mask/share_reasoning": 0.8830251693725586, "mask/share_step_conf": 0.11522849649190903, "num_tokens": 32190943.0, "reward": 0.00286776851862669, "reward_std": 0.007197332568466663, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.007374605629593134, "rewards/format_reward_step": 0.01171875, "rewards/step_l1_reward": -0.007107818499207497, "step": 163 }, { "adv/mean_abs_final_conf": 0.03864771127700806, "adv/mean_abs_reasoning": 0.04287446290254593, "adv/mean_abs_step_conf": 0.03591509163379669, "adv/ratio_final_to_reasoning": 0.9014156367358975, "adv/ratio_step_to_reasoning": 0.837680269381614, "adv/std_final_conf": 0.23417723178863525, "adv/std_reasoning": 0.23382404446601868, "adv/std_step_conf": 0.21824036538600922, "calib/answer_extract_rate": 0.0234375, "calib/auroc": 1.0, "calib/avg_num_step_conf": 0.96875, "calib/ece": 0.2386666666666667, "calib/final_conf_rate": 0.01953125, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.4, "calib/gap": 0.5972222222222222, "calib/mean_conf": 0.6266666666666667, "calib/mu_c": 0.985, "calib/mu_w": 0.3877777777777778, "calib/nonempty_final_conf_rate": 0.01953125, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.2326666666666667, "calib/std_conf": 0.3324321150417196, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.99, "calib/step_q_c_n": 1.0, "calib/step_q_gap": 0.4862210526315789, "calib/step_q_w": 0.5037789473684211, "calib/step_q_w_n": 247.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 160.5, "completions/mean_terminated_length": 160.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.17493333333333333, "grad_norm": 0.02655690908432007, "kl": 0.32733154296875, "learning_rate": 1.0000000000000002e-06, "loss": -0.1412, "mask/has_final_conf_rate": 0.01953125, "mask/share_final_conf": 0.0013986306730657816, "mask/share_reasoning": 0.8904591798782349, "mask/share_step_conf": 0.1081421747803688, "num_tokens": 32338167.0, "reward": 0.004336560145020485, "reward_std": 0.011081664822995663, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.007699218578636646, "rewards/format_reward_step": 0.0078125, "rewards/step_l1_reward": -0.002151099033653736, "step": 164 }, { "adv/mean_abs_final_conf": 0.057930298149585724, "adv/mean_abs_reasoning": 0.07717613130807877, "adv/mean_abs_step_conf": 0.057131461799144745, "adv/ratio_final_to_reasoning": 0.7506245411335046, "adv/ratio_step_to_reasoning": 0.7402737197473935, "adv/std_final_conf": 0.28660330176353455, "adv/std_reasoning": 0.3306655287742615, "adv/std_step_conf": 0.2827037274837494, "calib/answer_extract_rate": 0.02734375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 0.93359375, "calib/ece": 0.37, "calib/final_conf_rate": 0.01953125, "calib/format_rate": 0.01171875, "calib/frac_conf_gt_0.9": 0.2, "calib/gap": 0.10833333333333328, "calib/mean_conf": 0.43, "calib/mu_c": 0.495, "calib/mu_w": 0.3866666666666667, "calib/nonempty_final_conf_rate": 0.01953125, "calib/nonempty_reasoning_rate": 0.9375, "calib/nonempty_step_conf_rate": 0.921875, "calib/pce": 0.2, "calib/std_conf": 0.38481164223552283, "calib/step_conf_rate": 0.921875, "calib/step_q_c": 0.545, "calib/step_q_c_n": 2.0, "calib/step_q_gap": 0.04295541490857946, "calib/step_q_w": 0.5020445850914206, "calib/step_q_w_n": 237.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1105.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 184.875, "completions/mean_terminated_length": 184.875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.176, "grad_norm": 0.04415847733616829, "kl": 0.28472900390625, "learning_rate": 9.722222222222224e-07, "loss": -0.2662, "mask/has_final_conf_rate": 0.01953125, "mask/share_final_conf": 0.0005913099739700556, "mask/share_reasoning": 0.8977833986282349, "mask/share_step_conf": 0.10162527859210968, "num_tokens": 32491071.0, "reward": 0.004669106099754572, "reward_std": 0.014850882813334465, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.008543359115719795, "rewards/format_reward_step": 0.01171875, "rewards/step_l1_reward": -0.0038926471024751663, "step": 165 }, { "adv/mean_abs_final_conf": 0.1159270703792572, "adv/mean_abs_reasoning": 0.15429779887199402, "adv/mean_abs_step_conf": 0.11581672728061676, "adv/ratio_final_to_reasoning": 0.7513203119341365, "adv/ratio_step_to_reasoning": 0.7506051811970352, "adv/std_final_conf": 0.40555065870285034, "adv/std_reasoning": 0.46746668219566345, "adv/std_step_conf": 0.4051646888256073, "calib/answer_extract_rate": 0.03515625, "calib/auroc": 0.75, "calib/avg_num_step_conf": 0.90234375, "calib/ece": 0.38999999999999996, "calib/final_conf_rate": 0.03515625, "calib/format_rate": 0.0234375, "calib/frac_conf_gt_0.9": 0.1111111111111111, "calib/gap": 0.22449999999999992, "calib/mean_conf": 0.5077777777777777, "calib/mu_c": 0.6325, "calib/mu_w": 0.40800000000000003, "calib/nonempty_final_conf_rate": 0.03515625, "calib/nonempty_reasoning_rate": 0.89453125, "calib/nonempty_step_conf_rate": 0.8828125, "calib/pce": 0.22666666666666666, "calib/std_conf": 0.26080123456497917, "calib/step_conf_rate": 0.8828125, "calib/step_q_c": 0.73, "calib/step_q_c_n": 2.0, "calib/step_q_gap": 0.16625196506550222, "calib/step_q_w": 0.5637480349344978, "calib/step_q_w_n": 229.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1364.0, "completions/max_terminated_length": 1364.0, "completions/mean_length": 199.70703125, "completions/mean_terminated_length": 200.49020385742188, "completions/min_length": 0.0, "completions/min_terminated_length": 28.0, "epoch": 0.17706666666666668, "grad_norm": 0.06651771813631058, "kl": 0.244354248046875, "learning_rate": 9.444444444444445e-07, "loss": -0.3756, "mask/has_final_conf_rate": 0.03515625, "mask/share_final_conf": 0.0025812385138124228, "mask/share_reasoning": 0.8953035473823547, "mask/share_step_conf": 0.0982089564204216, "num_tokens": 32648380.0, "reward": 0.008751096203923225, "reward_std": 0.02475183829665184, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.01907304674386978, "rewards/format_reward_step": 0.0234375, "rewards/step_l1_reward": -0.00938335433602333, "step": 166 }, { "adv/mean_abs_final_conf": 0.057971835136413574, "adv/mean_abs_reasoning": 0.06382165849208832, "adv/mean_abs_step_conf": 0.057953860610723495, "adv/ratio_final_to_reasoning": 0.9083410946395272, "adv/ratio_step_to_reasoning": 0.9080594578705249, "adv/std_final_conf": 0.28680866956710815, "adv/std_reasoning": 0.2862262427806854, "adv/std_step_conf": 0.28671973943710327, "calib/answer_extract_rate": 0.0234375, "calib/auroc": 1.0, "calib/avg_num_step_conf": 0.9140625, "calib/ece": 0.07853333333333333, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.01171875, "calib/frac_conf_gt_0.9": 0.16666666666666666, "calib/gap": 0.89776, "calib/mean_conf": 0.2418666666666667, "calib/mu_c": 0.99, "calib/mu_w": 0.09224000000000002, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.92578125, "calib/nonempty_step_conf_rate": 0.9140625, "calib/pce": 0.07686666666666667, "calib/std_conf": 0.3421813684128475, "calib/step_conf_rate": 0.9140625, "calib/step_q_w": 0.557118660968661, "calib/step_q_w_n": 234.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1585.0, "completions/max_terminated_length": 1585.0, "completions/mean_length": 188.90625, "completions/mean_terminated_length": 188.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.17813333333333334, "grad_norm": 0.04468516632914543, "kl": 0.289154052734375, "learning_rate": 9.166666666666666e-07, "loss": -0.207, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.0017223083414137363, "mask/share_reasoning": 0.9020618200302124, "mask/share_step_conf": 0.0962158814072609, "num_tokens": 32802348.0, "reward": 0.003114057704806328, "reward_std": 0.008013888262212276, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.011639062315225601, "rewards/format_reward_step": 0.01171875, "rewards/step_l1_reward": -0.00853594671934843, "step": 167 }, { "adv/mean_abs_final_conf": 0.038645610213279724, "adv/mean_abs_reasoning": 0.10238249599933624, "adv/mean_abs_step_conf": 0.03857024013996124, "adv/ratio_final_to_reasoning": 0.37746305983329675, "adv/ratio_step_to_reasoning": 0.37672689812339893, "adv/std_final_conf": 0.23416449129581451, "adv/std_reasoning": 0.36948361992836, "adv/std_step_conf": 0.23370780050754547, "calib/answer_extract_rate": 0.02734375, "calib/auroc": 0.6666666666666667, "calib/avg_num_step_conf": 0.93359375, "calib/ece": 0.2557142857142858, "calib/final_conf_rate": 0.02734375, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.42857142857142855, "calib/gap": 0.35250000000000004, "calib/mean_conf": 0.7014285714285713, "calib/mu_c": 0.8525, "calib/mu_w": 0.5, "calib/nonempty_final_conf_rate": 0.02734375, "calib/nonempty_reasoning_rate": 0.94921875, "calib/nonempty_step_conf_rate": 0.9296875, "calib/pce": 0.1928571428571429, "calib/std_conf": 0.3081942138804487, "calib/step_conf_rate": 0.9296875, "calib/step_q_w": 0.5231748953974896, "calib/step_q_w_n": 239.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 174.734375, "completions/mean_terminated_length": 175.41961669921875, "completions/min_length": 0.0, "completions/min_terminated_length": 33.0, "epoch": 0.1792, "grad_norm": 0.032676585018634796, "kl": 0.303375244140625, "learning_rate": 8.88888888888889e-07, "loss": -0.1472, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.0014493621420115232, "mask/share_reasoning": 0.895774781703949, "mask/share_step_conf": 0.09886964410543442, "num_tokens": 32951752.0, "reward": 0.005033358000218868, "reward_std": 0.013473371975123882, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.0070917969569563866, "rewards/format_reward_step": 0.0078125, "rewards/step_l1_reward": -0.0017125809099525213, "step": 168 }, { "adv/mean_abs_final_conf": 0.06294244527816772, "adv/mean_abs_reasoning": 0.10074294358491898, "adv/mean_abs_step_conf": 0.06367062032222748, "adv/ratio_final_to_reasoning": 0.6247826700151144, "adv/ratio_step_to_reasoning": 0.6320107201211346, "adv/std_final_conf": 0.28657039999961853, "adv/std_reasoning": 0.3696483075618744, "adv/std_step_conf": 0.2865627706050873, "calib/answer_extract_rate": 0.03515625, "calib/auroc": 0.45833333333333337, "calib/avg_num_step_conf": 0.9296875, "calib/ece": 0.505, "calib/final_conf_rate": 0.0390625, "calib/format_rate": 0.015625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.06666666666666665, "calib/mean_conf": 0.525, "calib/mu_c": 0.485, "calib/mu_w": 0.5516666666666666, "calib/nonempty_final_conf_rate": 0.0390625, "calib/nonempty_reasoning_rate": 0.90625, "calib/nonempty_step_conf_rate": 0.88671875, "calib/pce": 0.315, "calib/std_conf": 0.29282247181526216, "calib/step_conf_rate": 0.88671875, "calib/step_q_c": 0.525, "calib/step_q_c_n": 2.0, "calib/step_q_gap": -0.007378389830508425, "calib/step_q_w": 0.5323783898305084, "calib/step_q_w_n": 236.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2561.0, "completions/max_terminated_length": 2561.0, "completions/mean_length": 201.8359375, "completions/mean_terminated_length": 202.62745666503906, "completions/min_length": 0.0, "completions/min_terminated_length": 26.0, "epoch": 0.18026666666666666, "grad_norm": 0.031436190009117126, "kl": 0.26751708984375, "learning_rate": 8.611111111111112e-07, "loss": -0.2209, "mask/has_final_conf_rate": 0.0390625, "mask/share_final_conf": 0.0018767904257401824, "mask/share_reasoning": 0.8958785533905029, "mask/share_step_conf": 0.09833839535713196, "num_tokens": 33107606.0, "reward": 0.005335357505828142, "reward_std": 0.012339570559561253, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.010004688054323196, "rewards/format_reward_step": 0.015625, "rewards/step_l1_reward": -0.005583972670137882, "step": 169 }, { "adv/mean_abs_final_conf": 0.03864695504307747, "adv/mean_abs_reasoning": 0.07714889943599701, "adv/mean_abs_step_conf": 0.03758513927459717, "adv/ratio_final_to_reasoning": 0.5009398102320192, "adv/ratio_step_to_reasoning": 0.4871766097684638, "adv/std_final_conf": 0.23417262732982635, "adv/std_reasoning": 0.33054885268211365, "adv/std_step_conf": 0.22782209515571594, "calib/answer_extract_rate": 0.0234375, "calib/auroc": 0.6666666666666666, "calib/avg_num_step_conf": 0.94921875, "calib/ece": 0.4, "calib/final_conf_rate": 0.02734375, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.2857142857142857, "calib/gap": 0.03499999999999992, "calib/mean_conf": 0.5599999999999999, "calib/mu_c": 0.58, "calib/mu_w": 0.545, "calib/nonempty_final_conf_rate": 0.02734375, "calib/nonempty_reasoning_rate": 0.94140625, "calib/nonempty_step_conf_rate": 0.92578125, "calib/pce": 0.2657142857142857, "calib/std_conf": 0.28784916685156975, "calib/step_conf_rate": 0.92578125, "calib/step_q_c": 0.99, "calib/step_q_c_n": 1.0, "calib/step_q_gap": 0.462343153526971, "calib/step_q_w": 0.527656846473029, "calib/step_q_w_n": 241.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1106.0, "completions/max_terminated_length": 1106.0, "completions/mean_length": 165.04296875, "completions/mean_terminated_length": 165.04296875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.18133333333333335, "grad_norm": 0.03978708013892174, "kl": 0.311492919921875, "learning_rate": 8.333333333333333e-07, "loss": -0.1579, "mask/has_final_conf_rate": 0.02734375, "mask/share_final_conf": 0.0013750765938311815, "mask/share_reasoning": 0.8882984519004822, "mask/share_step_conf": 0.11032651364803314, "num_tokens": 33254009.0, "reward": 0.005037762224674225, "reward_std": 0.014248941093683243, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.007459375075995922, "rewards/format_reward_step": 0.0078125, "rewards/step_l1_reward": -0.0012901013251394033, "step": 170 }, { "adv/mean_abs_final_conf": 0.057969287037849426, "adv/mean_abs_reasoning": 0.05784125626087189, "adv/mean_abs_step_conf": 0.05761338770389557, "adv/ratio_final_to_reasoning": 1.0022134854125593, "adv/ratio_step_to_reasoning": 0.9960604493797887, "adv/std_final_conf": 0.28679606318473816, "adv/std_reasoning": 0.2861626148223877, "adv/std_step_conf": 0.2850436270236969, "calib/answer_extract_rate": 0.01953125, "calib/avg_num_step_conf": 0.92578125, "calib/ece": 0.3583333333333333, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.01171875, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.35833333333333334, "calib/mu_c": NaN, "calib/mu_w": 0.35833333333333334, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.93359375, "calib/nonempty_step_conf_rate": 0.92578125, "calib/pce": 0.3583333333333333, "calib/std_conf": 0.19186945793661087, "calib/step_conf_rate": 0.92578125, "calib/step_q_w": 0.5368689170182841, "calib/step_q_w_n": 237.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 166.6640625, "completions/mean_terminated_length": 166.6640625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1824, "grad_norm": 0.04051095247268677, "kl": 0.2969970703125, "learning_rate": 8.055555555555557e-07, "loss": -0.2421, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.0008004299597814679, "mask/share_reasoning": 0.8912744522094727, "mask/share_step_conf": 0.10792511701583862, "num_tokens": 33403571.0, "reward": 0.004929243121296167, "reward_std": 0.01394200511276722, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.010848437435925007, "rewards/format_reward_step": 0.01171875, "rewards/step_l1_reward": -0.0033337008208036423, "step": 171 }, { "adv/mean_abs_final_conf": 0.0579204298555851, "adv/mean_abs_reasoning": 0.11568251252174377, "adv/mean_abs_step_conf": 0.05779913812875748, "adv/ratio_final_to_reasoning": 0.5006844041764595, "adv/ratio_step_to_reasoning": 0.49963591617093817, "adv/std_final_conf": 0.2865545153617859, "adv/std_reasoning": 0.4046950340270996, "adv/std_step_conf": 0.28595492243766785, "calib/answer_extract_rate": 0.02734375, "calib/auroc": 0.75, "calib/avg_num_step_conf": 0.9453125, "calib/ece": 0.3814285714285714, "calib/final_conf_rate": 0.02734375, "calib/format_rate": 0.01171875, "calib/frac_conf_gt_0.9": 0.14285714285714285, "calib/gap": 0.22333333333333333, "calib/mean_conf": 0.5157142857142857, "calib/mu_c": 0.6433333333333333, "calib/mu_w": 0.42, "calib/nonempty_final_conf_rate": 0.02734375, "calib/nonempty_reasoning_rate": 0.9609375, "calib/nonempty_step_conf_rate": 0.9453125, "calib/pce": 0.23428571428571426, "calib/std_conf": 0.3290617459582034, "calib/step_conf_rate": 0.9453125, "calib/step_q_w": 0.5292363636363636, "calib/step_q_w_n": 242.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 165.16015625, "completions/mean_terminated_length": 165.80784606933594, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.18346666666666667, "grad_norm": 0.05485696345567703, "kl": 0.28277587890625, "learning_rate": 7.777777777777779e-07, "loss": -0.2087, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.0016495675081387162, "mask/share_reasoning": 0.8820877075195312, "mask/share_step_conf": 0.11235648393630981, "num_tokens": 33549204.0, "reward": 0.004013527184724808, "reward_std": 0.011351969093084335, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.007190625183284283, "rewards/format_reward_step": 0.01171875, "rewards/step_l1_reward": -0.003851071000099182, "step": 172 }, { "adv/mean_abs_final_conf": 0.03862863779067993, "adv/mean_abs_reasoning": 0.06382165849208832, "adv/mean_abs_step_conf": 0.03851080313324928, "adv/ratio_final_to_reasoning": 0.6052590719726995, "adv/ratio_step_to_reasoning": 0.6034127605446558, "adv/std_final_conf": 0.23406165838241577, "adv/std_reasoning": 0.2862262427806854, "adv/std_step_conf": 0.23334872722625732, "calib/answer_extract_rate": 0.01953125, "calib/auroc": 1.0, "calib/avg_num_step_conf": 0.921875, "calib/ece": 0.20400000000000004, "calib/final_conf_rate": 0.01953125, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.4, "calib/gap": 0.6566666666666667, "calib/mean_conf": 0.5960000000000001, "calib/mu_c": 0.99, "calib/mu_w": 0.3333333333333333, "calib/nonempty_final_conf_rate": 0.01953125, "calib/nonempty_reasoning_rate": 0.921875, "calib/nonempty_step_conf_rate": 0.91015625, "calib/pce": 0.20000000000000004, "calib/std_conf": 0.41572106032771544, "calib/step_conf_rate": 0.91015625, "calib/step_q_w": 0.5427545197740112, "calib/step_q_w_n": 236.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1437.0, "completions/max_terminated_length": 1437.0, "completions/mean_length": 180.21875, "completions/mean_terminated_length": 180.21875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.18453333333333333, "grad_norm": 0.04185887426137924, "kl": 0.29754638671875, "learning_rate": 7.5e-07, "loss": -0.1567, "mask/has_final_conf_rate": 0.01953125, "mask/share_final_conf": 0.001149651245214045, "mask/share_reasoning": 0.8883706331253052, "mask/share_step_conf": 0.1104796975851059, "num_tokens": 33698500.0, "reward": 0.0030319697689265013, "reward_std": 0.007411236874759197, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.0053125000558793545, "rewards/format_reward_step": 0.0078125, "rewards/step_l1_reward": -0.0023735607974231243, "step": 173 }, { "adv/mean_abs_final_conf": 0.038619525730609894, "adv/mean_abs_reasoning": 0.05784125626087189, "adv/mean_abs_step_conf": 0.0386103130877018, "adv/ratio_final_to_reasoning": 0.667681309624926, "adv/ratio_step_to_reasoning": 0.6675220350257274, "adv/std_final_conf": 0.23400649428367615, "adv/std_reasoning": 0.2861626148223877, "adv/std_step_conf": 0.23395071923732758, "calib/answer_extract_rate": 0.015625, "calib/auroc": 0.6666666666666667, "calib/avg_num_step_conf": 0.9453125, "calib/ece": 0.31999999999999995, "calib/final_conf_rate": 0.015625, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.3533333333333334, "calib/mean_conf": 0.405, "calib/mu_c": 0.67, "calib/mu_w": 0.31666666666666665, "calib/nonempty_final_conf_rate": 0.015625, "calib/nonempty_reasoning_rate": 0.921875, "calib/nonempty_step_conf_rate": 0.9140625, "calib/pce": 0.2375, "calib/std_conf": 0.3623879137057416, "calib/step_conf_rate": 0.9140625, "calib/step_q_w": 0.5508133608815428, "calib/step_q_w_n": 242.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1087.0, "completions/max_terminated_length": 1087.0, "completions/mean_length": 181.86328125, "completions/mean_terminated_length": 182.57647705078125, "completions/min_length": 0.0, "completions/min_terminated_length": 27.0, "epoch": 0.1856, "grad_norm": 0.03075559437274933, "kl": 0.2668304443359375, "learning_rate": 7.222222222222222e-07, "loss": -0.1588, "mask/has_final_conf_rate": 0.015625, "mask/share_final_conf": 0.0006882546003907919, "mask/share_reasoning": 0.8974344730377197, "mask/share_step_conf": 0.09797105193138123, "num_tokens": 33849289.0, "reward": 0.0014810014981776476, "reward_std": 0.0041889045387506485, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.004951172042638063, "rewards/format_reward_step": 0.0078125, "rewards/step_l1_reward": -0.004332918673753738, "step": 174 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019280418753623962, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16521605849266052, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.01171875, "calib/auroc": 1.0, "calib/avg_num_step_conf": 0.88671875, "calib/ece": 0.43999999999999995, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.2400000000000001, "calib/mean_conf": 0.64, "calib/mu_c": 0.8, "calib/mu_w": 0.5599999999999999, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.8984375, "calib/nonempty_step_conf_rate": 0.88671875, "calib/pce": 0.3733333333333333, "calib/std_conf": 0.16083117442419761, "calib/step_conf_rate": 0.88671875, "calib/step_q_w": 0.46687679882525696, "calib/step_q_w_n": 227.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2529.0, "completions/max_terminated_length": 2529.0, "completions/mean_length": 188.6875, "completions/mean_terminated_length": 188.6875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.18666666666666668, "grad_norm": 0.003241309430450201, "kl": 0.2762603759765625, "learning_rate": 6.944444444444446e-07, "loss": -0.0061, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.00039259361801669, "mask/share_reasoning": 0.8896238207817078, "mask/share_step_conf": 0.10998360067605972, "num_tokens": 34003417.0, "reward": 0.0003906250058207661, "reward_std": 0.001104854280129075, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 175 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0234375, "calib/avg_num_step_conf": 0.88671875, "calib/ece": 0.42166666666666663, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.4216666666666667, "calib/mu_c": NaN, "calib/mu_w": 0.4216666666666667, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.90625, "calib/nonempty_step_conf_rate": 0.8828125, "calib/pce": 0.42166666666666663, "calib/std_conf": 0.34503220461606515, "calib/step_conf_rate": 0.8828125, "calib/step_q_w": 0.5479162995594714, "calib/step_q_w_n": 227.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3017.0, "completions/max_terminated_length": 3017.0, "completions/mean_length": 209.3828125, "completions/mean_terminated_length": 209.3828125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.18773333333333334, "grad_norm": 0.002791334642097354, "kl": 0.24755859375, "learning_rate": 6.666666666666667e-07, "loss": 0.0212, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.0006819250411354005, "mask/share_reasoning": 0.9036825299263, "mask/share_step_conf": 0.09563553333282471, "num_tokens": 34161083.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 176 }, { "adv/mean_abs_final_conf": 0.019307058304548264, "adv/mean_abs_reasoning": 0.03858806565403938, "adv/mean_abs_step_conf": 0.019321506842970848, "adv/ratio_final_to_reasoning": 0.500337551968667, "adv/ratio_step_to_reasoning": 0.5007119822018931, "adv/std_final_conf": 0.16544435918331146, "adv/std_reasoning": 0.23381584882736206, "adv/std_step_conf": 0.16556817293167114, "calib/answer_extract_rate": 0.01171875, "calib/auroc": 0.0, "calib/avg_num_step_conf": 0.9375, "calib/ece": 0.765, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.53, "calib/mean_conf": 0.485, "calib/mu_c": 0.22, "calib/mu_w": 0.75, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.93359375, "calib/nonempty_step_conf_rate": 0.92578125, "calib/pce": 0.375, "calib/std_conf": 0.265, "calib/step_conf_rate": 0.92578125, "calib/step_q_c": 0.14, "calib/step_q_c_n": 1.0, "calib/step_q_gap": -0.3586164574616457, "calib/step_q_w": 0.49861645746164573, "calib/step_q_w_n": 239.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1485.0, "completions/max_terminated_length": 1485.0, "completions/mean_length": 168.48828125, "completions/mean_terminated_length": 168.48828125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.1888, "grad_norm": 0.024655012413859367, "kl": 0.273468017578125, "learning_rate": 6.388888888888889e-07, "loss": -0.0691, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.0005021016113460064, "mask/share_reasoning": 0.8795663118362427, "mask/share_step_conf": 0.11993157863616943, "num_tokens": 34308048.0, "reward": 0.0003487913345452398, "reward_std": 0.0012231777654960752, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.00152968754991889, "rewards/format_reward_step": 0.00390625, "rewards/step_l1_reward": -0.003175854915753007, "step": 177 }, { "adv/mean_abs_final_conf": 0.03864792734384537, "adv/mean_abs_reasoning": 0.057868484407663345, "adv/mean_abs_step_conf": 0.03859715908765793, "adv/ratio_final_to_reasoning": 0.6678579496152718, "adv/ratio_step_to_reasoning": 0.6669806455575088, "adv/std_final_conf": 0.23417851328849792, "adv/std_reasoning": 0.28629738092422485, "adv/std_step_conf": 0.23387092351913452, "calib/answer_extract_rate": 0.02734375, "calib/auroc": 1.0, "calib/avg_num_step_conf": 1.01953125, "calib/ece": 0.14600000000000002, "calib/final_conf_rate": 0.01953125, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.2, "calib/gap": 0.81, "calib/mean_conf": 0.34199999999999997, "calib/mu_c": 0.99, "calib/mu_w": 0.18, "calib/nonempty_final_conf_rate": 0.01953125, "calib/nonempty_reasoning_rate": 0.9609375, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.14400000000000002, "calib/std_conf": 0.3584633872517527, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.71, "calib/step_q_c_n": 1.0, "calib/step_q_gap": 0.2291955128205127, "calib/step_q_w": 0.48080448717948726, "calib/step_q_w_n": 260.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 163.58984375, "completions/mean_terminated_length": 164.23138427734375, "completions/min_length": 0.0, "completions/min_terminated_length": 25.0, "epoch": 0.18986666666666666, "grad_norm": 0.04221513122320175, "kl": 0.303070068359375, "learning_rate": 6.111111111111112e-07, "loss": -0.1459, "mask/has_final_conf_rate": 0.015625, "mask/share_final_conf": 0.0008479391690343618, "mask/share_reasoning": 0.8811119794845581, "mask/share_step_conf": 0.11413383483886719, "num_tokens": 34455999.0, "reward": 0.004273958504199982, "reward_std": 0.012088580057024956, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.007773046847432852, "rewards/format_reward_step": 0.0078125, "rewards/step_l1_reward": -0.0023501296527683735, "step": 178 }, { "adv/mean_abs_final_conf": 0.019323695451021194, "adv/mean_abs_reasoning": 0.019280418753623962, "adv/mean_abs_step_conf": 0.01888473890721798, "adv/ratio_final_to_reasoning": 1.0022445932295478, "adv/ratio_step_to_reasoning": 0.9794776321270714, "adv/std_final_conf": 0.165586918592453, "adv/std_reasoning": 0.16521605849266052, "adv/std_step_conf": 0.16182544827461243, "calib/answer_extract_rate": 0.00390625, "calib/avg_num_step_conf": 0.93359375, "calib/ece": 0.17, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.17, "calib/mu_c": NaN, "calib/mu_w": 0.17, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.92578125, "calib/nonempty_step_conf_rate": 0.92578125, "calib/pce": 0.17, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.92578125, "calib/step_q_w": 0.5037097629009764, "calib/step_q_w_n": 239.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2128.0, "completions/max_terminated_length": 2128.0, "completions/mean_length": 157.078125, "completions/mean_terminated_length": 157.078125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.19093333333333334, "grad_norm": 0.03157714381814003, "kl": 0.33184814453125, "learning_rate": 5.833333333333334e-07, "loss": -0.0458, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.0005696614389307797, "mask/share_reasoning": 0.896777331829071, "mask/share_step_conf": 0.10265299677848816, "num_tokens": 34602475.0, "reward": 0.0022409602534025908, "reward_std": 0.006338392850011587, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0037933592684566975, "rewards/format_reward_step": 0.00390625, "rewards/step_l1_reward": -9.268872963730246e-05, "step": 179 }, { "adv/mean_abs_final_conf": 0.038644008338451385, "adv/mean_abs_reasoning": 0.09640209376811981, "adv/mean_abs_step_conf": 0.03858642280101776, "adv/ratio_final_to_reasoning": 0.400862749219986, "adv/ratio_step_to_reasoning": 0.40026540184730197, "adv/std_final_conf": 0.23415479063987732, "adv/std_reasoning": 0.3694343566894531, "adv/std_step_conf": 0.233805850148201, "calib/answer_extract_rate": 0.03125, "calib/auroc": 0.5333333333333333, "calib/avg_num_step_conf": 0.87109375, "calib/ece": 0.51125, "calib/final_conf_rate": 0.03125, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.125, "calib/gap": -0.0020000000000000018, "calib/mean_conf": 0.54125, "calib/mu_c": 0.54, "calib/mu_w": 0.542, "calib/nonempty_final_conf_rate": 0.03125, "calib/nonempty_reasoning_rate": 0.88671875, "calib/nonempty_step_conf_rate": 0.86328125, "calib/pce": 0.33875, "calib/std_conf": 0.30097497819586266, "calib/step_conf_rate": 0.86328125, "calib/step_q_w": 0.5032677130044844, "calib/step_q_w_n": 223.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2056.0, "completions/max_terminated_length": 2056.0, "completions/mean_length": 205.48828125, "completions/mean_terminated_length": 205.48828125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.192, "grad_norm": 0.034869104623794556, "kl": 0.2762451171875, "learning_rate": 5.555555555555555e-07, "loss": -0.1626, "mask/has_final_conf_rate": 0.03125, "mask/share_final_conf": 0.0018083257600665092, "mask/share_reasoning": 0.903313159942627, "mask/share_step_conf": 0.09487849473953247, "num_tokens": 34758936.0, "reward": 0.004252666607499123, "reward_std": 0.01202835701406002, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.006643359549343586, "rewards/format_reward_step": 0.0078125, "rewards/step_l1_reward": -0.002044275403022766, "step": 180 }, { "adv/mean_abs_final_conf": 0.019241439178586006, "adv/mean_abs_reasoning": 0.038560837507247925, "adv/mean_abs_step_conf": 0.019322792068123817, "adv/ratio_final_to_reasoning": 0.49898914086005963, "adv/ratio_step_to_reasoning": 0.5010988691439051, "adv/std_final_conf": 0.16488207876682281, "adv/std_reasoning": 0.233650803565979, "adv/std_step_conf": 0.1655791848897934, "calib/answer_extract_rate": 0.03515625, "calib/auroc": 0.5714285714285714, "calib/avg_num_step_conf": 0.90625, "calib/ece": 0.465, "calib/final_conf_rate": 0.03125, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.125, "calib/gap": 0.18285714285714294, "calib/mean_conf": 0.5900000000000001, "calib/mu_c": 0.75, "calib/mu_w": 0.5671428571428571, "calib/nonempty_final_conf_rate": 0.03125, "calib/nonempty_reasoning_rate": 0.90625, "calib/nonempty_step_conf_rate": 0.875, "calib/pce": 0.465, "calib/std_conf": 0.25446021299998944, "calib/step_conf_rate": 0.875, "calib/step_q_w": 0.5063456896551723, "calib/step_q_w_n": 232.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2063.0, "completions/max_terminated_length": 2063.0, "completions/mean_length": 198.19140625, "completions/mean_terminated_length": 198.19140625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.19306666666666666, "grad_norm": 0.020284345373511314, "kl": 0.28765869140625, "learning_rate": 5.277777777777779e-07, "loss": -0.0618, "mask/has_final_conf_rate": 0.03125, "mask/share_final_conf": 0.0029326127842068672, "mask/share_reasoning": 0.8988545536994934, "mask/share_step_conf": 0.0982128456234932, "num_tokens": 34915937.0, "reward": -0.0007472168654203415, "reward_std": 0.004323157016187906, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0004546875134110451, "rewards/format_reward_step": 0.00390625, "rewards/step_l1_reward": -0.0035116211511194706, "step": 181 }, { "adv/mean_abs_final_conf": 0.03859543055295944, "adv/mean_abs_reasoning": 0.05784125626087189, "adv/mean_abs_step_conf": 0.038640618324279785, "adv/ratio_final_to_reasoning": 0.6672647353800345, "adv/ratio_step_to_reasoning": 0.6680459731027516, "adv/std_final_conf": 0.2338605523109436, "adv/std_reasoning": 0.2861626148223877, "adv/std_step_conf": 0.2341342270374298, "calib/answer_extract_rate": 0.01171875, "calib/auroc": 1.0, "calib/avg_num_step_conf": 0.96484375, "calib/ece": 0.4966666666666667, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.3333333333333333, "calib/gap": 0.24, "calib/mean_conf": 0.83, "calib/mu_c": 0.99, "calib/mu_w": 0.75, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.95703125, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.4966666666666667, "calib/std_conf": 0.16673332000533067, "calib/step_conf_rate": 0.953125, "calib/step_q_w": 0.5249087719298245, "calib/step_q_w_n": 247.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2943.0, "completions/max_terminated_length": 2943.0, "completions/mean_length": 186.1484375, "completions/mean_terminated_length": 186.1484375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.19413333333333332, "grad_norm": 0.044177763164043427, "kl": 0.3182373046875, "learning_rate": 5.000000000000001e-07, "loss": -0.1202, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0004936805926263332, "mask/share_reasoning": 0.8925653696060181, "mask/share_step_conf": 0.10694096982479095, "num_tokens": 35069751.0, "reward": -0.00030797565705142915, "reward_std": 0.005734951235353947, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0032421874348074198, "rewards/format_reward_step": 0.0078125, "rewards/step_l1_reward": -0.006201888434588909, "step": 182 }, { "adv/mean_abs_final_conf": 0.019319282844662666, "adv/mean_abs_reasoning": 0.05784125626087189, "adv/mean_abs_step_conf": 0.019303763285279274, "adv/ratio_final_to_reasoning": 0.33400524285866284, "adv/ratio_step_to_reasoning": 0.3337369298864584, "adv/std_final_conf": 0.16554909944534302, "adv/std_reasoning": 0.2861626148223877, "adv/std_step_conf": 0.16541613638401031, "calib/answer_extract_rate": 0.015625, "calib/auroc": 0.75, "calib/avg_num_step_conf": 0.921875, "calib/ece": 0.07250000000000001, "calib/final_conf_rate": 0.015625, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.25, "calib/gap": 0.3549999999999999, "calib/mean_conf": 0.5325, "calib/mu_c": 0.71, "calib/mu_w": 0.35500000000000004, "calib/nonempty_final_conf_rate": 0.015625, "calib/nonempty_reasoning_rate": 0.921875, "calib/nonempty_step_conf_rate": 0.91015625, "calib/pce": 0.05250000000000001, "calib/std_conf": 0.2693858756505248, "calib/step_conf_rate": 0.91015625, "calib/step_q_w": 0.5333563559322034, "calib/step_q_w_n": 236.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2107.0, "completions/max_terminated_length": 2107.0, "completions/mean_length": 187.48046875, "completions/mean_terminated_length": 187.48046875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.1952, "grad_norm": 0.022845527157187462, "kl": 0.2704925537109375, "learning_rate": 4.7222222222222226e-07, "loss": -0.0812, "mask/has_final_conf_rate": 0.015625, "mask/share_final_conf": 0.0006599759799428284, "mask/share_reasoning": 0.9025527834892273, "mask/share_step_conf": 0.09678725898265839, "num_tokens": 35224426.0, "reward": 0.0018502858001738787, "reward_std": 0.005233398173004389, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.002724609337747097, "rewards/format_reward_step": 0.00390625, "rewards/step_l1_reward": -0.0013677878305315971, "step": 183 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.00390625, "calib/avg_num_step_conf": 0.921875, "calib/ece": 0.0, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.0, "calib/mu_c": NaN, "calib/mu_w": 0.0, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.921875, "calib/nonempty_step_conf_rate": 0.91796875, "calib/pce": 0.0, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.91796875, "calib/step_q_w": 0.551475988700565, "calib/step_q_w_n": 236.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1173.0, "completions/max_terminated_length": 1173.0, "completions/mean_length": 177.40234375, "completions/mean_terminated_length": 177.40234375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.19626666666666667, "grad_norm": 0.0030443009454756975, "kl": 0.288055419921875, "learning_rate": 4.444444444444445e-07, "loss": 0.0214, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.000396286224713549, "mask/share_reasoning": 0.892707109451294, "mask/share_step_conf": 0.10689658671617508, "num_tokens": 35375121.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 184 }, { "adv/mean_abs_final_conf": 0.01931559108197689, "adv/mean_abs_reasoning": 0.05784125626087189, "adv/mean_abs_step_conf": 0.019306572154164314, "adv/ratio_final_to_reasoning": 0.33394141708923064, "adv/ratio_step_to_reasoning": 0.33378549157177817, "adv/std_final_conf": 0.16551747918128967, "adv/std_reasoning": 0.2861626148223877, "adv/std_step_conf": 0.16544018685817719, "calib/answer_extract_rate": 0.0234375, "calib/auroc": 0.125, "calib/avg_num_step_conf": 0.84375, "calib/ece": 0.6966666666666667, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.3333333333333333, "calib/gap": -0.245, "calib/mean_conf": 0.7633333333333333, "calib/mu_c": 0.6, "calib/mu_w": 0.845, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.85546875, "calib/nonempty_step_conf_rate": 0.8359375, "calib/pce": 0.5633333333333332, "calib/std_conf": 0.17123732718721763, "calib/step_conf_rate": 0.8359375, "calib/step_q_w": 0.5409444444444444, "calib/step_q_w_n": 216.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2607.0, "completions/max_terminated_length": 2607.0, "completions/mean_length": 206.65234375, "completions/mean_terminated_length": 206.65234375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.19733333333333333, "grad_norm": 0.02343006432056427, "kl": 0.2517852783203125, "learning_rate": 4.1666666666666667e-07, "loss": -0.0215, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.0009006580803543329, "mask/share_reasoning": 0.9018073678016663, "mask/share_step_conf": 0.09729200601577759, "num_tokens": 35534944.0, "reward": 0.001522548496723175, "reward_std": 0.004306417424231768, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.0022046875674277544, "rewards/format_reward_step": 0.00390625, "rewards/step_l1_reward": -0.0015033404342830181, "step": 185 }, { "adv/mean_abs_final_conf": 0.038623660802841187, "adv/mean_abs_reasoning": 0.06384889036417007, "adv/mean_abs_step_conf": 0.03805459290742874, "adv/ratio_final_to_reasoning": 0.6049229764612406, "adv/ratio_step_to_reasoning": 0.5960102468559696, "adv/std_final_conf": 0.23403151333332062, "adv/std_reasoning": 0.2863609790802002, "adv/std_step_conf": 0.23061054944992065, "calib/answer_extract_rate": 0.0234375, "calib/auroc": 0.8333333333333333, "calib/avg_num_step_conf": 0.9375, "calib/ece": 0.262, "calib/final_conf_rate": 0.01953125, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.17833333333333334, "calib/mean_conf": 0.662, "calib/mu_c": 0.7333333333333333, "calib/mu_w": 0.5549999999999999, "calib/nonempty_final_conf_rate": 0.01953125, "calib/nonempty_reasoning_rate": 0.953125, "calib/nonempty_step_conf_rate": 0.9375, "calib/pce": 0.16199999999999998, "calib/std_conf": 0.23292917378465067, "calib/step_conf_rate": 0.9375, "calib/step_q_c": 0.98, "calib/step_q_c_n": 1.0, "calib/step_q_gap": 0.463876150627615, "calib/step_q_w": 0.516123849372385, "calib/step_q_w_n": 239.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2867.0, "completions/max_terminated_length": 2867.0, "completions/mean_length": 198.6484375, "completions/mean_terminated_length": 198.6484375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.1984, "grad_norm": 0.028277983888983727, "kl": 0.2912445068359375, "learning_rate": 3.8888888888888895e-07, "loss": -0.1085, "mask/has_final_conf_rate": 0.01953125, "mask/share_final_conf": 0.0014739439357072115, "mask/share_reasoning": 0.8970207571983337, "mask/share_step_conf": 0.10150527954101562, "num_tokens": 35690838.0, "reward": 0.003056081011891365, "reward_std": 0.010295093059539795, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.005044922232627869, "rewards/format_reward_step": 0.0078125, "rewards/step_l1_reward": -0.002839010441675782, "step": 186 }, { "adv/mean_abs_final_conf": 0.0579669326543808, "adv/mean_abs_reasoning": 0.05784125626087189, "adv/mean_abs_step_conf": 0.05779435485601425, "adv/ratio_final_to_reasoning": 1.0021727811882593, "adv/ratio_step_to_reasoning": 0.9991891357849126, "adv/std_final_conf": 0.28678441047668457, "adv/std_reasoning": 0.2861626148223877, "adv/std_step_conf": 0.28593212366104126, "calib/answer_extract_rate": 0.015625, "calib/avg_num_step_conf": 0.8828125, "calib/ece": 0.2025, "calib/final_conf_rate": 0.015625, "calib/format_rate": 0.01171875, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.2025, "calib/mu_c": NaN, "calib/mu_w": 0.2025, "calib/nonempty_final_conf_rate": 0.015625, "calib/nonempty_reasoning_rate": 0.87109375, "calib/nonempty_step_conf_rate": 0.8671875, "calib/pce": 0.2025, "calib/std_conf": 0.21787324296480282, "calib/step_conf_rate": 0.8671875, "calib/step_q_w": 0.54857802359882, "calib/step_q_w_n": 226.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1382.0, "completions/max_terminated_length": 1382.0, "completions/mean_length": 218.73828125, "completions/mean_terminated_length": 218.73828125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.19946666666666665, "grad_norm": 0.03050275146961212, "kl": 0.286041259765625, "learning_rate": 3.611111111111111e-07, "loss": -0.1996, "mask/has_final_conf_rate": 0.015625, "mask/share_final_conf": 0.0014880726812407374, "mask/share_reasoning": 0.9059315919876099, "mask/share_step_conf": 0.09258037060499191, "num_tokens": 35848379.0, "reward": 0.004234543535858393, "reward_std": 0.011977097019553185, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.010336718522012234, "rewards/format_reward_step": 0.01171875, "rewards/step_l1_reward": -0.004211381543427706, "step": 187 }, { "adv/mean_abs_final_conf": 0.019320379942655563, "adv/mean_abs_reasoning": 0.019280418753623962, "adv/mean_abs_step_conf": 0.019292619079351425, "adv/ratio_final_to_reasoning": 1.0020726307629646, "adv/ratio_step_to_reasoning": 1.0006327832337754, "adv/std_final_conf": 0.16555850207805634, "adv/std_reasoning": 0.16521605849266052, "adv/std_step_conf": 0.16532061994075775, "calib/answer_extract_rate": 0.01171875, "calib/avg_num_step_conf": 0.93359375, "calib/ece": 0.43333333333333335, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.43333333333333335, "calib/mu_c": NaN, "calib/mu_w": 0.43333333333333335, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.90625, "calib/nonempty_step_conf_rate": 0.8984375, "calib/pce": 0.43333333333333335, "calib/std_conf": 0.32998316455372223, "calib/step_conf_rate": 0.8984375, "calib/step_q_w": 0.5555447698744769, "calib/step_q_w_n": 239.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1503.0, "completions/max_terminated_length": 1503.0, "completions/mean_length": 208.265625, "completions/mean_terminated_length": 208.265625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.20053333333333334, "grad_norm": 0.025662289932370186, "kl": 0.23193359375, "learning_rate": 3.3333333333333335e-07, "loss": -0.0553, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0011713827261701226, "mask/share_reasoning": 0.893328845500946, "mask/share_step_conf": 0.105499766767025, "num_tokens": 36005767.0, "reward": 0.0013519477797672153, "reward_std": 0.0038238859269768, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0029296875, "rewards/format_reward_step": 0.00390625, "rewards/step_l1_reward": -0.0010070418938994408, "step": 188 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0078125, "calib/avg_num_step_conf": 0.9453125, "calib/ece": 0.67, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.67, "calib/mu_c": NaN, "calib/mu_w": 0.67, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.9296875, "calib/nonempty_step_conf_rate": 0.921875, "calib/pce": 0.67, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.921875, "calib/step_q_w": 0.5065026170798899, "calib/step_q_w_n": 242.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 180.62890625, "completions/mean_terminated_length": 180.62890625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.2016, "grad_norm": 0.002889833180233836, "kl": 0.284515380859375, "learning_rate": 3.055555555555556e-07, "loss": 0.0214, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.00011811555305030197, "mask/share_reasoning": 0.8950053453445435, "mask/share_step_conf": 0.1048765480518341, "num_tokens": 36159776.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 189 }, { "adv/mean_abs_final_conf": 0.019304616376757622, "adv/mean_abs_reasoning": 0.038560837507247925, "adv/mean_abs_step_conf": 0.01932068169116974, "adv/ratio_final_to_reasoning": 0.5006275180908379, "adv/ratio_step_to_reasoning": 0.5010441406397931, "adv/std_final_conf": 0.1654234230518341, "adv/std_reasoning": 0.233650803565979, "adv/std_step_conf": 0.16556109488010406, "calib/answer_extract_rate": 0.02734375, "calib/auroc": 0.4, "calib/avg_num_step_conf": 0.921875, "calib/ece": 0.4116666666666666, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.22999999999999998, "calib/mean_conf": 0.3316666666666667, "calib/mu_c": 0.14, "calib/mu_w": 0.37, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.93359375, "calib/nonempty_step_conf_rate": 0.91015625, "calib/pce": 0.28833333333333333, "calib/std_conf": 0.30212672102208293, "calib/step_conf_rate": 0.91015625, "calib/step_q_w": 0.5487427966101693, "calib/step_q_w_n": 236.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1556.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 201.04296875, "completions/mean_terminated_length": 201.04296875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.20266666666666666, "grad_norm": 0.03346448019146919, "kl": 0.2524871826171875, "learning_rate": 2.7777777777777776e-07, "loss": -0.0267, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.0015675068134441972, "mask/share_reasoning": 0.9004506468772888, "mask/share_step_conf": 0.09798184782266617, "num_tokens": 36316851.0, "reward": -1.1547759640961885e-05, "reward_std": 0.0022423705086112022, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0014062500558793545, "rewards/format_reward_step": 0.00390625, "rewards/step_l1_reward": -0.0029918455984443426, "step": 190 }, { "adv/mean_abs_final_conf": 0.019319752231240273, "adv/mean_abs_reasoning": 0.038560837507247925, "adv/mean_abs_step_conf": 0.01928817853331566, "adv/ratio_final_to_reasoning": 0.5010200369120332, "adv/ratio_step_to_reasoning": 0.5002012347291533, "adv/std_final_conf": 0.16555313766002655, "adv/std_reasoning": 0.233650803565979, "adv/std_step_conf": 0.16528257727622986, "calib/answer_extract_rate": 0.0078125, "calib/auroc": 1.0, "calib/avg_num_step_conf": 0.9921875, "calib/ece": 0.34, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.31999999999999995, "calib/mean_conf": 0.69, "calib/mu_c": 0.85, "calib/mu_w": 0.53, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.93359375, "calib/nonempty_step_conf_rate": 0.9296875, "calib/pce": 0.265, "calib/std_conf": 0.15999999999999998, "calib/step_conf_rate": 0.9296875, "calib/step_q_w": 0.5243385826771654, "calib/step_q_w_n": 254.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1479.0, "completions/max_terminated_length": 1479.0, "completions/mean_length": 152.26171875, "completions/mean_terminated_length": 152.26171875, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.20373333333333332, "grad_norm": 0.022792931646108627, "kl": 0.31573486328125, "learning_rate": 2.5000000000000004e-07, "loss": -0.073, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.0003579213807824999, "mask/share_reasoning": 0.8862248659133911, "mask/share_step_conf": 0.11341720819473267, "num_tokens": 36459998.0, "reward": 0.0017301104962825775, "reward_std": 0.004893491044640541, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0028089843690395355, "rewards/format_reward_step": 0.00390625, "rewards/step_l1_reward": -0.0009112633997574449, "step": 191 }, { "adv/mean_abs_final_conf": 0.0386478528380394, "adv/mean_abs_reasoning": 0.10836289823055267, "adv/mean_abs_step_conf": 0.03857652470469475, "adv/ratio_final_to_reasoning": 0.35665207805546423, "adv/ratio_step_to_reasoning": 0.35599384415336893, "adv/std_final_conf": 0.2341780662536621, "adv/std_reasoning": 0.36953291296958923, "adv/std_step_conf": 0.23374615609645844, "calib/answer_extract_rate": 0.03515625, "calib/auroc": 0.8333333333333334, "calib/avg_num_step_conf": 0.8984375, "calib/ece": 0.21571428571428572, "calib/final_conf_rate": 0.02734375, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.42857142857142855, "calib/gap": 0.49, "calib/mean_conf": 0.55, "calib/mu_c": 0.76, "calib/mu_w": 0.27, "calib/nonempty_final_conf_rate": 0.02734375, "calib/nonempty_reasoning_rate": 0.921875, "calib/nonempty_step_conf_rate": 0.89453125, "calib/pce": 0.09714285714285714, "calib/std_conf": 0.42311768035449837, "calib/step_conf_rate": 0.89453125, "calib/step_q_w": 0.45417101449275354, "calib/step_q_w_n": 230.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1404.0, "completions/max_terminated_length": 1404.0, "completions/mean_length": 214.1796875, "completions/mean_terminated_length": 214.1796875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.2048, "grad_norm": 0.03924354165792465, "kl": 0.2512054443359375, "learning_rate": 2.2222222222222224e-07, "loss": -0.1604, "mask/has_final_conf_rate": 0.02734375, "mask/share_final_conf": 0.0009351474000141025, "mask/share_reasoning": 0.897925078868866, "mask/share_step_conf": 0.10113979130983353, "num_tokens": 36619804.0, "reward": 0.004981360863894224, "reward_std": 0.012563186697661877, "rewards/accuracy_reward_step": 0.01953125, "rewards/final_brier_reward_step": 0.007746484130620956, "rewards/format_reward_step": 0.0078125, "rewards/step_l1_reward": -0.0032525130081921816, "step": 192 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019280418753623962, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16521605849266052, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.015625, "calib/auroc": 0.0, "calib/avg_num_step_conf": 0.90625, "calib/ece": 0.74, "calib/final_conf_rate": 0.015625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": -0.18666666666666676, "calib/mean_conf": 0.8400000000000001, "calib/mu_c": 0.7, "calib/mu_w": 0.8866666666666667, "calib/nonempty_final_conf_rate": 0.015625, "calib/nonempty_reasoning_rate": 0.90625, "calib/nonempty_step_conf_rate": 0.890625, "calib/pce": 0.665, "calib/std_conf": 0.08803408430829507, "calib/step_conf_rate": 0.890625, "calib/step_q_w": 0.6004439655172413, "calib/step_q_w_n": 232.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 189.59765625, "completions/mean_terminated_length": 190.3411865234375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.20586666666666667, "grad_norm": 0.002885238267481327, "kl": 0.249420166015625, "learning_rate": 1.9444444444444447e-07, "loss": 0.0013, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0010916463797912002, "mask/share_reasoning": 0.9009318351745605, "mask/share_step_conf": 0.09407031536102295, "num_tokens": 36774053.0, "reward": 0.0003906250058207661, "reward_std": 0.001104854280129075, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l1_reward": 0.0, "step": 193 }, { "adv/mean_abs_final_conf": 0.038605351001024246, "adv/mean_abs_reasoning": 0.07717613130807877, "adv/mean_abs_step_conf": 0.0577361173927784, "adv/ratio_final_to_reasoning": 0.5002239727062227, "adv/ratio_step_to_reasoning": 0.7481084684369843, "adv/std_final_conf": 0.2339206337928772, "adv/std_reasoning": 0.3306655287742615, "adv/std_step_conf": 0.28564655780792236, "calib/answer_extract_rate": 0.02734375, "calib/auroc": 0.0, "calib/avg_num_step_conf": 0.91796875, "calib/ece": 0.43833333333333324, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.01171875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.2875, "calib/mean_conf": 0.25166666666666665, "calib/mu_c": 0.06, "calib/mu_w": 0.3475, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.921875, "calib/nonempty_step_conf_rate": 0.90625, "calib/pce": 0.1783333333333333, "calib/std_conf": 0.20843997270730538, "calib/step_conf_rate": 0.90625, "calib/step_q_c": 0.135, "calib/step_q_c_n": 2.0, "calib/step_q_gap": -0.397555078683834, "calib/step_q_w": 0.532555078683834, "calib/step_q_w_n": 233.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 187.640625, "completions/mean_terminated_length": 188.37648010253906, "completions/min_length": 0.0, "completions/min_terminated_length": 28.0, "epoch": 0.20693333333333333, "grad_norm": 0.029423270374536514, "kl": 0.246551513671875, "learning_rate": 1.6666666666666668e-07, "loss": -0.1481, "mask/has_final_conf_rate": 0.01953125, "mask/share_final_conf": 0.0009616934694349766, "mask/share_reasoning": 0.9088914394378662, "mask/share_step_conf": 0.08624064922332764, "num_tokens": 36928033.0, "reward": 0.00113745778799057, "reward_std": 0.009823394939303398, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.0035183595027774572, "rewards/format_reward_step": 0.01171875, "rewards/step_l1_reward": -0.005930944345891476, "step": 194 }, { "adv/mean_abs_final_conf": 0.057969868183135986, "adv/mean_abs_reasoning": 0.1246509775519371, "adv/mean_abs_step_conf": 0.05788629502058029, "adv/ratio_final_to_reasoning": 0.4650574694368702, "adv/ratio_step_to_reasoning": 0.46438701209913397, "adv/std_final_conf": 0.28679895401000977, "adv/std_reasoning": 0.4047554135322571, "adv/std_step_conf": 0.28638574481010437, "calib/answer_extract_rate": 0.04296875, "calib/auroc": 0.41666666666666663, "calib/avg_num_step_conf": 0.90234375, "calib/ece": 0.5057142857142857, "calib/final_conf_rate": 0.02734375, "calib/format_rate": 0.01171875, "calib/frac_conf_gt_0.9": 0.2857142857142857, "calib/gap": -0.05166666666666664, "calib/mean_conf": 0.3628571428571429, "calib/mu_c": 0.3333333333333333, "calib/mu_w": 0.38499999999999995, "calib/nonempty_final_conf_rate": 0.02734375, "calib/nonempty_reasoning_rate": 0.921875, "calib/nonempty_step_conf_rate": 0.89453125, "calib/pce": 0.21999999999999997, "calib/std_conf": 0.41337337967073806, "calib/step_conf_rate": 0.89453125, "calib/step_q_w": 0.5457682539682539, "calib/step_q_w_n": 231.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2341.0, "completions/max_terminated_length": 2341.0, "completions/mean_length": 199.84375, "completions/mean_terminated_length": 199.84375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.208, "grad_norm": 0.03922109678387642, "kl": 0.25457763671875, "learning_rate": 1.3888888888888888e-07, "loss": -0.2147, "mask/has_final_conf_rate": 0.02734375, "mask/share_final_conf": 0.0022088377736508846, "mask/share_reasoning": 0.8914566040039062, "mask/share_step_conf": 0.10633458197116852, "num_tokens": 37085177.0, "reward": 0.0062236604280769825, "reward_std": 0.015905946493148804, "rewards/accuracy_reward_step": 0.01953125, "rewards/final_brier_reward_step": 0.011027734726667404, "rewards/format_reward_step": 0.01171875, "rewards/step_l1_reward": -0.004830413497984409, "step": 195 }, { "adv/mean_abs_final_conf": 0.01932217739522457, "adv/mean_abs_reasoning": 0.038560837507247925, "adv/mean_abs_step_conf": 0.01932252198457718, "adv/ratio_final_to_reasoning": 0.5010829288029016, "adv/ratio_step_to_reasoning": 0.5010918650546763, "adv/std_final_conf": 0.16557389497756958, "adv/std_reasoning": 0.233650803565979, "adv/std_step_conf": 0.16557686030864716, "calib/answer_extract_rate": 0.01171875, "calib/avg_num_step_conf": 0.96484375, "calib/ece": 0.38, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.38, "calib/mu_c": NaN, "calib/mu_w": 0.38, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.94140625, "calib/nonempty_step_conf_rate": 0.93359375, "calib/pce": 0.38, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.93359375, "calib/step_q_w": 0.5592665317139002, "calib/step_q_w_n": 247.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1046.0, "completions/max_terminated_length": 1046.0, "completions/mean_length": 162.5625, "completions/mean_terminated_length": 162.5625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.20906666666666668, "grad_norm": 0.02420944906771183, "kl": 0.298065185546875, "learning_rate": 1.1111111111111112e-07, "loss": -0.0729, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.00036951014772057533, "mask/share_reasoning": 0.8943559527397156, "mask/share_step_conf": 0.10527454316616058, "num_tokens": 37229337.0, "reward": 0.0007347895880229771, "reward_std": 0.0020782987121492624, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.003342187497764826, "rewards/format_reward_step": 0.00390625, "rewards/step_l1_reward": -0.003435108345001936, "step": 196 }, { "adv/mean_abs_final_conf": 0.01926400512456894, "adv/mean_abs_reasoning": 0.05784125626087189, "adv/mean_abs_step_conf": 0.01931987702846527, "adv/ratio_final_to_reasoning": 0.3330495630607619, "adv/ratio_step_to_reasoning": 0.33401551552286507, "adv/std_final_conf": 0.16507543623447418, "adv/std_reasoning": 0.2861626148223877, "adv/std_step_conf": 0.16555418074131012, "calib/answer_extract_rate": 0.0234375, "calib/auroc": 0.875, "calib/avg_num_step_conf": 0.94140625, "calib/ece": 0.23666666666666664, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.3333333333333333, "calib/gap": 0.4974999999999999, "calib/mean_conf": 0.4333333333333333, "calib/mu_c": 0.7649999999999999, "calib/mu_w": 0.2675, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.9296875, "calib/nonempty_step_conf_rate": 0.91015625, "calib/pce": 0.1683333333333333, "calib/std_conf": 0.4008186067980821, "calib/step_conf_rate": 0.91015625, "calib/step_q_w": 0.4886099585062241, "calib/step_q_w_n": 241.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1189.0, "completions/max_terminated_length": 1189.0, "completions/mean_length": 157.53125, "completions/mean_terminated_length": 157.53125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.21013333333333334, "grad_norm": 0.025260915979743004, "kl": 0.313568115234375, "learning_rate": 8.333333333333334e-08, "loss": -0.0755, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.0016457033343613148, "mask/share_reasoning": 0.8901784420013428, "mask/share_step_conf": 0.10817582905292511, "num_tokens": 37374721.0, "reward": 5.584879545494914e-05, "reward_std": 0.004261452704668045, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.0006000000284984708, "rewards/format_reward_step": 0.00390625, "rewards/step_l1_reward": -0.00283205253072083, "step": 197 }, { "adv/mean_abs_final_conf": 0.03863754868507385, "adv/mean_abs_reasoning": 0.05784125626087189, "adv/mean_abs_step_conf": 0.03862864524126053, "adv/ratio_final_to_reasoning": 0.6679929030381582, "adv/ratio_step_to_reasoning": 0.6678389740886698, "adv/std_final_conf": 0.2341156303882599, "adv/std_reasoning": 0.2861626148223877, "adv/std_step_conf": 0.23406170308589935, "calib/answer_extract_rate": 0.015625, "calib/avg_num_step_conf": 0.9296875, "calib/ece": 0.3533333333333333, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.3533333333333333, "calib/mu_c": NaN, "calib/mu_w": 0.3533333333333333, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.9296875, "calib/nonempty_step_conf_rate": 0.921875, "calib/pce": 0.3533333333333333, "calib/std_conf": 0.281701181317288, "calib/step_conf_rate": 0.921875, "calib/step_q_w": 0.5330756302521008, "calib/step_q_w_n": 238.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2469.0, "completions/max_terminated_length": 2469.0, "completions/mean_length": 180.39453125, "completions/mean_terminated_length": 180.39453125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.2112, "grad_norm": 0.04118936508893967, "kl": 0.31329345703125, "learning_rate": 5.555555555555556e-08, "loss": -0.1535, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0013754223473370075, "mask/share_reasoning": 0.8814821243286133, "mask/share_step_conf": 0.11714246869087219, "num_tokens": 37526286.0, "reward": 0.002013332908973098, "reward_std": 0.005694565363228321, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.005898046772927046, "rewards/format_reward_step": 0.0078125, "rewards/step_l1_reward": -0.004215131048113108, "step": 198 }, { "adv/mean_abs_final_conf": 0.03863748535513878, "adv/mean_abs_reasoning": 0.038560837507247925, "adv/mean_abs_step_conf": 0.03851601108908653, "adv/ratio_final_to_reasoning": 1.0019877122190732, "adv/ratio_step_to_reasoning": 0.9988375144042718, "adv/std_final_conf": 0.23411524295806885, "adv/std_reasoning": 0.233650803565979, "adv/std_step_conf": 0.2333802878856659, "calib/answer_extract_rate": 0.0078125, "calib/avg_num_step_conf": 0.9453125, "calib/ece": 0.495, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.495, "calib/mu_c": NaN, "calib/mu_w": 0.495, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.9453125, "calib/nonempty_step_conf_rate": 0.9453125, "calib/pce": 0.495, "calib/std_conf": 0.18500000000000003, "calib/step_conf_rate": 0.9453125, "calib/step_q_w": 0.5458679063360882, "calib/step_q_w_n": 242.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3044.0, "completions/max_terminated_length": 3044.0, "completions/mean_length": 174.23046875, "completions/mean_terminated_length": 174.23046875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.21226666666666666, "grad_norm": 0.040901776403188705, "kl": 0.3167266845703125, "learning_rate": 2.777777777777778e-08, "loss": -0.1257, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.0010554521577432752, "mask/share_reasoning": 0.8885200023651123, "mask/share_step_conf": 0.11042454093694687, "num_tokens": 37675089.0, "reward": 0.0022908253595232964, "reward_std": 0.0064794328063726425, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.005630859173834324, "rewards/format_reward_step": 0.0078125, "rewards/step_l1_reward": -0.0026117085944861174, "step": 199 }, { "adv/mean_abs_final_conf": 0.038646847009658813, "adv/mean_abs_reasoning": 0.0578957125544548, "adv/mean_abs_step_conf": 0.0386156402528286, "adv/ratio_final_to_reasoning": 0.6675251984040935, "adv/ratio_step_to_reasoning": 0.6669861816884626, "adv/std_final_conf": 0.23417198657989502, "adv/std_reasoning": 0.2864321172237396, "adv/std_step_conf": 0.2339828908443451, "calib/answer_extract_rate": 0.02734375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 0.921875, "calib/ece": 0.26600000000000007, "calib/final_conf_rate": 0.01953125, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.2, "calib/gap": 0.21833333333333338, "calib/mean_conf": 0.746, "calib/mu_c": 0.8333333333333334, "calib/mu_w": 0.615, "calib/nonempty_final_conf_rate": 0.01953125, "calib/nonempty_reasoning_rate": 0.92578125, "calib/nonempty_step_conf_rate": 0.90625, "calib/pce": 0.20600000000000007, "calib/std_conf": 0.2646960521050512, "calib/step_conf_rate": 0.90625, "calib/step_q_c": 0.485, "calib/step_q_c_n": 2.0, "calib/step_q_gap": -0.08322578347578347, "calib/step_q_w": 0.5682257834757835, "calib/step_q_w_n": 234.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2144.0, "completions/max_terminated_length": 2144.0, "completions/mean_length": 215.08984375, "completions/mean_terminated_length": 215.08984375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.21333333333333335, "grad_norm": 0.05663759633898735, "kl": 0.2462615966796875, "learning_rate": 0.0, "loss": -0.1622, "mask/has_final_conf_rate": 0.01953125, "mask/share_final_conf": 0.0006202845834195614, "mask/share_reasoning": 0.9045542478561401, "mask/share_step_conf": 0.09482549130916595, "num_tokens": 37838200.0, "reward": 0.004006184637546539, "reward_std": 0.01133120059967041, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.00742187537252903, "rewards/format_reward_step": 0.0078125, "rewards/step_l1_reward": -0.003315756330266595, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": -0.061138040876830925, "train_runtime": 9391.5886, "train_samples_per_second": 5.452, "train_steps_per_second": 0.021 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 37838200, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }