{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.38076182006817844, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.2003187250996017, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2948207171314741, "calib/gap": -0.026059730250481805, "calib/mean_conf": 0.8737051792828686, "calib/mu_c": 0.865606936416185, "calib/mu_w": 0.8916666666666668, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19239043824701207, "calib/std_conf": 0.09027744273295583, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7959393232205367, "calib/step_q_c_n": 857.0, "calib/step_q_gap": -0.006446568895645877, "calib/step_q_w": 0.8023858921161826, "calib/step_q_w_n": 482.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 474.94921875, "completions/mean_terminated_length": 478.68896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.0010666666666666667, "grad_norm": 0.0070291250012815, "kl": 0.000291675329208374, "learning_rate": 2.5000000000000004e-07, "loss": 0.0516, "num_tokens": 229171.0, "reward": 0.5306904315948486, "reward_std": 0.15138749778270721, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7142800688743591, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.016632115468382835, "step": 1 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.44343065693430656, "calib/avg_num_step_conf": 5.05859375, "calib/ece": 0.3349411764705883, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2823529411764706, "calib/gap": 0.002352468143016151, "calib/mean_conf": 0.8721960784313726, "calib/mu_c": 0.8732846715328467, "calib/mu_w": 0.8709322033898306, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3349411764705883, "calib/std_conf": 0.07627016470309335, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7954391371340525, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.011011892552009073, "calib/step_q_w": 0.7844272445820434, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1966.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 492.9765625, "completions/mean_terminated_length": 494.9098205566406, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.0021333333333333334, "grad_norm": 0.007271615322679281, "kl": 0.00037539005279541016, "learning_rate": 5.000000000000001e-07, "loss": -0.0125, "num_tokens": 458661.0, "reward": 0.47535353899002075, "reward_std": 0.15537551045417786, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6320762038230896, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.012380896136164665, "step": 2 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.49794238683127573, "calib/avg_num_step_conf": 5.09375, "calib/ece": 0.24564705882352944, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.2823529411764706, "calib/gap": -0.0007188371166866325, "calib/mean_conf": 0.8809411764705882, "calib/mu_c": 0.8806790123456791, "calib/mu_w": 0.8813978494623658, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.24564705882352944, "calib/std_conf": 0.04406653248090241, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7875328083989501, "calib/step_q_c_n": 762.0, "calib/step_q_gap": 0.028252365594522044, "calib/step_q_w": 0.7592804428044281, "calib/step_q_w_n": 542.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1591.0, "completions/max_terminated_length": 1591.0, "completions/mean_length": 502.5078125, "completions/mean_terminated_length": 504.47845458984375, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.0032, "grad_norm": 0.007185729686170816, "kl": 0.0012685656547546387, "learning_rate": 7.5e-07, "loss": 0.0124, "num_tokens": 692559.0, "reward": 0.5189927220344543, "reward_std": 0.13065262138843536, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7026242017745972, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.01036119181662798, "step": 3 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4632880529432254, "calib/avg_num_step_conf": 5.19921875, "calib/ece": 0.22222222222222235, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.30158730158730157, "calib/gap": 0.0034691745036575794, "calib/mean_conf": 0.876984126984127, "calib/mu_c": 0.8781818181818184, "calib/mu_w": 0.8747126436781608, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.22222222222222235, "calib/std_conf": 0.05825191380832978, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7934677419354839, "calib/step_q_c_n": 868.0, "calib/step_q_gap": 0.014720441719501154, "calib/step_q_w": 0.7787473002159827, "calib/step_q_w_n": 463.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2456.0, "completions/max_terminated_length": 2456.0, "completions/mean_length": 523.35546875, "completions/mean_terminated_length": 525.4078979492188, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.004266666666666667, "grad_norm": 0.007038692943751812, "kl": 0.0002740919589996338, "learning_rate": 1.0000000000000002e-06, "loss": 0.046, "num_tokens": 932706.0, "reward": 0.5242385268211365, "reward_std": 0.1526789367198944, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7104023694992065, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.013855919241905212, "step": 4 }, { "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.4222260040844112, "calib/avg_num_step_conf": 5.0390625, "calib/ece": 0.3433333333333333, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.2674897119341564, "calib/gap": -0.012205582028590789, "calib/mean_conf": 0.8779835390946501, "calib/mu_c": 0.8723076923076923, "calib/mu_w": 0.8845132743362831, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.34316872427983536, "calib/std_conf": 0.04670685809638801, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.7886736214605068, "calib/step_q_c_n": 671.0, "calib/step_q_gap": 0.007429679618826568, "calib/step_q_w": 0.7812439418416802, "calib/step_q_w_n": 619.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2678.0, "completions/max_terminated_length": 2678.0, "completions/mean_length": 538.12109375, "completions/mean_terminated_length": 542.3582763671875, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.005333333333333333, "grad_norm": 0.0068595572374761105, "kl": 0.00029274821281433105, "learning_rate": 1.25e-06, "loss": 0.0803, "num_tokens": 1177153.0, "reward": 0.44816750288009644, "reward_std": 0.13739368319511414, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5918129086494446, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.014678382314741611, "step": 5 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4842447587774691, "calib/avg_num_step_conf": 5.44921875, "calib/ece": 0.2972156862745098, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.25882352941176473, "calib/gap": -0.002398964384945579, "calib/mean_conf": 0.8758039215686274, "calib/mu_c": 0.8747972972972973, "calib/mu_w": 0.8771962616822429, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2963137254901961, "calib/std_conf": 0.04259292936251419, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7968199233716474, "calib/step_q_c_n": 783.0, "calib/step_q_gap": 0.0017872436330853558, "calib/step_q_w": 0.7950326797385621, "calib/step_q_w_n": 612.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2477.0, "completions/max_terminated_length": 2477.0, "completions/mean_length": 451.4921875, "completions/mean_terminated_length": 451.4921875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.0064, "grad_norm": 0.008030211552977562, "kl": 0.00039631128311157227, "learning_rate": 1.5e-06, "loss": 0.0086, "num_tokens": 1398687.0, "reward": 0.4995495676994324, "reward_std": 0.11786103248596191, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6636097431182861, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.02064560167491436, "step": 6 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.45714188914487064, "calib/avg_num_step_conf": 5.5078125, "calib/ece": 0.25796812749004, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.3466135458167331, "calib/gap": -0.006704160455346164, "calib/mean_conf": 0.8834661354581673, "calib/mu_c": 0.8809554140127389, "calib/mu_w": 0.8876595744680851, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.25796812749004, "calib/std_conf": 0.04749017353386353, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8038183934807916, "calib/step_q_c_n": 859.0, "calib/step_q_gap": 0.02223944611237061, "calib/step_q_w": 0.781578947368421, "calib/step_q_w_n": 551.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2374.0, "completions/max_terminated_length": 2374.0, "completions/mean_length": 542.875, "completions/mean_terminated_length": 545.0039672851562, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.007466666666666667, "grad_norm": 0.006746398750692606, "kl": 0.005849212408065796, "learning_rate": 1.75e-06, "loss": 0.0181, "num_tokens": 1645087.0, "reward": 0.5083991289138794, "reward_std": 0.14332298934459686, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6798004508018494, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.01902913488447666, "step": 7 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5029393173198482, "calib/avg_num_step_conf": 4.95703125, "calib/ece": 0.31596837944664025, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.2766798418972332, "calib/gap": 0.01349873577749694, "calib/mean_conf": 0.8693280632411067, "calib/mu_c": 0.875357142857143, "calib/mu_w": 0.861858407079646, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.31596837944664025, "calib/std_conf": 0.08297782385661961, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7995081967213116, "calib/step_q_c_n": 671.0, "calib/step_q_gap": 0.023086792039037296, "calib/step_q_w": 0.7764214046822743, "calib/step_q_w_n": 598.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2356.0, "completions/max_terminated_length": 2356.0, "completions/mean_length": 528.59765625, "completions/mean_terminated_length": 528.59765625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.008533333333333334, "grad_norm": 0.006836770102381706, "kl": 0.0005608052015304565, "learning_rate": 2.0000000000000003e-06, "loss": 0.0212, "num_tokens": 1886920.0, "reward": 0.4802365303039551, "reward_std": 0.1404484510421753, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6440644264221191, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.01015863474458456, "step": 8 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.45059007053716765, "calib/avg_num_step_conf": 5.109375, "calib/ece": 0.26429718875502006, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.24497991967871485, "calib/gap": -0.004903011394465584, "calib/mean_conf": 0.8747389558232932, "calib/mu_c": 0.872828947368421, "calib/mu_w": 0.8777319587628866, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.26429718875502006, "calib/std_conf": 0.04688737390643053, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.7938685015290521, "calib/step_q_c_n": 654.0, "calib/step_q_gap": 0.06206422018348634, "calib/step_q_w": 0.7318042813455657, "calib/step_q_w_n": 654.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3060.0, "completions/max_terminated_length": 3060.0, "completions/mean_length": 504.71484375, "completions/mean_terminated_length": 508.68896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 189.0, "epoch": 0.0096, "grad_norm": 0.007662550546228886, "kl": 0.00041669607162475586, "learning_rate": 2.25e-06, "loss": 0.0244, "num_tokens": 2123663.0, "reward": 0.4904530942440033, "reward_std": 0.15297287702560425, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6537765860557556, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.016973400488495827, "step": 9 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5013053998981152, "calib/avg_num_step_conf": 5.03515625, "calib/ece": 0.2852549019607844, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.23529411764705882, "calib/gap": 0.0006628884360672105, "calib/mean_conf": 0.8774117647058823, "calib/mu_c": 0.877682119205298, "calib/mu_w": 0.8770192307692308, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2852549019607844, "calib/std_conf": 0.04070418221543076, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.789935373645681, "calib/step_q_c_n": 683.0, "calib/step_q_gap": 1.3802463411094479e-06, "calib/step_q_w": 0.7899339933993399, "calib/step_q_w_n": 606.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2195.0, "completions/max_terminated_length": 2195.0, "completions/mean_length": 510.5390625, "completions/mean_terminated_length": 510.5390625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.010666666666666666, "grad_norm": 0.007049976848065853, "kl": 0.00035446882247924805, "learning_rate": 2.5e-06, "loss": 0.0477, "num_tokens": 2361161.0, "reward": 0.5098337531089783, "reward_std": 0.1368577778339386, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6724746227264404, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.030786586925387383, "step": 10 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.37179744464512676, "calib/avg_num_step_conf": 5.45703125, "calib/ece": 0.28284, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.312, "calib/gap": -0.023507258010569188, "calib/mean_conf": 0.8796399999999999, "calib/mu_c": 0.8703311258278145, "calib/mu_w": 0.8938383838383837, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.27924, "calib/std_conf": 0.05583072988955097, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7897932816537467, "calib/step_q_c_n": 774.0, "calib/step_q_gap": -0.005856798603074975, "calib/step_q_w": 0.7956500802568217, "calib/step_q_w_n": 623.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2879.0, "completions/max_terminated_length": 2879.0, "completions/mean_length": 531.0703125, "completions/mean_terminated_length": 537.3676147460938, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.011733333333333333, "grad_norm": 0.006753360386937857, "kl": 0.0003243088722229004, "learning_rate": 2.7500000000000004e-06, "loss": 0.019, "num_tokens": 2601595.0, "reward": 0.49244898557662964, "reward_std": 0.14083197712898254, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6497530937194824, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.023426111787557602, "step": 11 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.46517780717453894, "calib/avg_num_step_conf": 5.55078125, "calib/ece": 0.1634523809523809, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2976190476190476, "calib/gap": -0.007576842269084061, "calib/mean_conf": 0.8763888888888889, "calib/mu_c": 0.8742541436464089, "calib/mu_w": 0.8818309859154929, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.16079365079365074, "calib/std_conf": 0.05049021156466664, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7882559456398641, "calib/step_q_c_n": 883.0, "calib/step_q_gap": 0.0034418192458119945, "calib/step_q_w": 0.7848141263940521, "calib/step_q_w_n": 538.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3015.0, "completions/max_terminated_length": 3015.0, "completions/mean_length": 484.59765625, "completions/mean_terminated_length": 486.4980773925781, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.0128, "grad_norm": 0.007357874885201454, "kl": 0.0006407797336578369, "learning_rate": 3e-06, "loss": 0.0476, "num_tokens": 2829828.0, "reward": 0.5489996671676636, "reward_std": 0.1275169402360916, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.7473390698432922, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.013941464945673943, "step": 12 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5545263559969442, "calib/avg_num_step_conf": 4.95703125, "calib/ece": 0.2750390625, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.26953125, "calib/gap": 0.005106951871657883, "calib/mean_conf": 0.8766015625000001, "calib/mu_c": 0.8786363636363637, "calib/mu_w": 0.8735294117647058, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2750390625, "calib/std_conf": 0.047841835746118624, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7919701086956523, "calib/step_q_c_n": 736.0, "calib/step_q_gap": 0.014352847907659894, "calib/step_q_w": 0.7776172607879924, "calib/step_q_w_n": 533.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1362.0, "completions/max_terminated_length": 1362.0, "completions/mean_length": 456.11328125, "completions/mean_terminated_length": 457.9019775390625, "completions/min_length": 0.0, "completions/min_terminated_length": 116.0, "epoch": 0.013866666666666666, "grad_norm": 0.00735123734921217, "kl": 0.0006067156791687012, "learning_rate": 3.2500000000000002e-06, "loss": -0.0061, "num_tokens": 3051185.0, "reward": 0.5164021253585815, "reward_std": 0.1543343961238861, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6848277449607849, "rewards/format_reward_step": 1.0, "rewards/step_margin_reward": 0.027664033696055412, "step": 13 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4406148867313916, "calib/avg_num_step_conf": 5.65234375, "calib/ece": 0.2902766798418972, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.3359683794466403, "calib/gap": -0.005965695792880088, "calib/mean_conf": 0.8831620553359684, "calib/mu_c": 0.8807333333333335, "calib/mu_w": 0.8866990291262136, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2902766798418972, "calib/std_conf": 0.044155239785541624, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7921544209215442, "calib/step_q_c_n": 803.0, "calib/step_q_gap": 0.01664199856129578, "calib/step_q_w": 0.7755124223602484, "calib/step_q_w_n": 644.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2817.0, "completions/max_terminated_length": 2817.0, "completions/mean_length": 544.5859375, "completions/mean_terminated_length": 546.7216186523438, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.014933333333333333, "grad_norm": 0.007078804075717926, "kl": 0.0010630488395690918, "learning_rate": 3.5e-06, "loss": -0.0326, "num_tokens": 3295999.0, "reward": 0.49770310521125793, "reward_std": 0.1487538069486618, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.661691427230835, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.018871046602725983, "step": 14 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.46976936799184504, "calib/avg_num_step_conf": 5.12109375, "calib/ece": 0.31177865612648215, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.33992094861660077, "calib/gap": -0.003974898063200838, "calib/mean_conf": 0.8809486166007905, "calib/mu_c": 0.8792361111111111, "calib/mu_w": 0.883211009174312, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31177865612648215, "calib/std_conf": 0.04669183862870357, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7845135135135135, "calib/step_q_c_n": 740.0, "calib/step_q_gap": -0.008341127467222043, "calib/step_q_w": 0.7928546409807355, "calib/step_q_w_n": 571.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2718.0, "completions/max_terminated_length": 2718.0, "completions/mean_length": 471.171875, "completions/mean_terminated_length": 473.0196228027344, "completions/min_length": 0.0, "completions/min_terminated_length": 122.0, "epoch": 0.016, "grad_norm": 0.007281082682311535, "kl": 0.0008074045181274414, "learning_rate": 3.7500000000000005e-06, "loss": 0.0353, "num_tokens": 3524499.0, "reward": 0.4907722473144531, "reward_std": 0.1448393166065216, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6457914113998413, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.025596803054213524, "step": 15 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.47812584391034296, "calib/avg_num_step_conf": 6.42578125, "calib/ece": 0.24822134387351769, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.3241106719367589, "calib/gap": -0.003385093167701858, "calib/mean_conf": 0.884584980237154, "calib/mu_c": 0.8833540372670807, "calib/mu_w": 0.8867391304347826, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.24822134387351769, "calib/std_conf": 0.046857003177099794, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7781346678798908, "calib/step_q_c_n": 1099.0, "calib/step_q_gap": 0.02170609645131938, "calib/step_q_w": 0.7564285714285715, "calib/step_q_w_n": 546.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2532.0, "completions/max_terminated_length": 2532.0, "completions/mean_length": 634.10546875, "completions/mean_terminated_length": 634.10546875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.017066666666666667, "grad_norm": 0.006321294233202934, "kl": 0.0008182525634765625, "learning_rate": 4.000000000000001e-06, "loss": 0.0243, "num_tokens": 3795678.0, "reward": 0.5235186219215393, "reward_std": 0.1506902426481247, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6949781179428101, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.02862163446843624, "step": 16 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5686486486486487, "calib/avg_num_step_conf": 5.62890625, "calib/ece": 0.14972549019607842, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.25098039215686274, "calib/gap": 0.010729729729729853, "calib/mean_conf": 0.8727843137254903, "calib/mu_c": 0.8757297297297297, "calib/mu_w": 0.8649999999999999, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.14850980392156862, "calib/std_conf": 0.04758163829630502, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7728096118299445, "calib/step_q_c_n": 1082.0, "calib/step_q_gap": 0.018269221857799667, "calib/step_q_w": 0.7545403899721448, "calib/step_q_w_n": 359.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1184.0, "completions/max_terminated_length": 1184.0, "completions/mean_length": 497.2734375, "completions/mean_terminated_length": 499.22357177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.018133333333333335, "grad_norm": 0.007205154746770859, "kl": 0.001395106315612793, "learning_rate": 4.25e-06, "loss": -0.0207, "num_tokens": 4026508.0, "reward": 0.5758857727050781, "reward_std": 0.14907167851924896, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7742902636528015, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.03451257944107056, "step": 17 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4860372340425532, "calib/avg_num_step_conf": 5.578125, "calib/ece": 0.32288537549407115, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.31225296442687744, "calib/gap": -0.0030097517730496293, "calib/mean_conf": 0.880197628458498, "calib/mu_c": 0.8788652482269502, "calib/mu_w": 0.8818749999999999, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32288537549407115, "calib/std_conf": 0.04862903533260498, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7488387978142076, "calib/step_q_c_n": 732.0, "calib/step_q_gap": 0.019413510457885863, "calib/step_q_w": 0.7294252873563217, "calib/step_q_w_n": 696.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2594.0, "completions/max_terminated_length": 2594.0, "completions/mean_length": 522.1171875, "completions/mean_terminated_length": 522.1171875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.0192, "grad_norm": 0.007514473982155323, "kl": 0.0019025802612304688, "learning_rate": 4.5e-06, "loss": 0.0443, "num_tokens": 4270890.0, "reward": 0.4936829209327698, "reward_std": 0.134256511926651, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.637388288974762, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.042946361005306244, "step": 18 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5046207162110128, "calib/avg_num_step_conf": 4.73828125, "calib/ece": 0.2918577075098815, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.2727272727272727, "calib/gap": 0.009512257733282015, "calib/mean_conf": 0.8728853754940713, "calib/mu_c": 0.8768707482993198, "calib/mu_w": 0.8673584905660378, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2918577075098815, "calib/std_conf": 0.06021022708978, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7552691218130312, "calib/step_q_c_n": 706.0, "calib/step_q_gap": 0.027576814120723547, "calib/step_q_w": 0.7276923076923076, "calib/step_q_w_n": 507.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3030.0, "completions/max_terminated_length": 3030.0, "completions/mean_length": 468.10546875, "completions/mean_terminated_length": 468.10546875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.020266666666666665, "grad_norm": 0.007622662466019392, "kl": 0.0032324790954589844, "learning_rate": 4.75e-06, "loss": 0.031, "num_tokens": 4495485.0, "reward": 0.5108803510665894, "reward_std": 0.14525091648101807, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6640562415122986, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.04598575830459595, "step": 19 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4476190476190476, "calib/avg_num_step_conf": 5.73828125, "calib/ece": 0.28889763779527555, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.2559055118110236, "calib/gap": -0.007488654522211702, "calib/mean_conf": 0.8755118110236221, "calib/mu_c": 0.8724161073825502, "calib/mu_w": 0.8799047619047619, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.28889763779527555, "calib/std_conf": 0.04754125703367803, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7442355889724311, "calib/step_q_c_n": 798.0, "calib/step_q_gap": 0.018795946647542916, "calib/step_q_w": 0.7254396423248882, "calib/step_q_w_n": 671.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2643.0, "completions/max_terminated_length": 2643.0, "completions/mean_length": 471.95703125, "completions/mean_terminated_length": 471.95703125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.021333333333333333, "grad_norm": 0.013368158601224422, "kl": 0.0924372673034668, "learning_rate": 5e-06, "loss": 0.0014, "num_tokens": 4721178.0, "reward": 0.5101369023323059, "reward_std": 0.12661463022232056, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6629281044006348, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.042501889169216156, "step": 20 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.45863618342804285, "calib/avg_num_step_conf": 5.609375, "calib/ece": 0.2530980392156864, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2784313725490196, "calib/gap": -0.011480111008325578, "calib/mean_conf": 0.8746666666666667, "calib/mu_c": 0.8704347826086956, "calib/mu_w": 0.8819148936170211, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24819607843137265, "calib/std_conf": 0.05550292932268253, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7002188940092166, "calib/step_q_c_n": 868.0, "calib/step_q_gap": -0.024235331342896127, "calib/step_q_w": 0.7244542253521127, "calib/step_q_w_n": 568.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1558.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 475.74609375, "completions/mean_terminated_length": 477.6117858886719, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.0224, "grad_norm": 0.008174967020750046, "kl": 0.006114959716796875, "learning_rate": 4.9722222222222224e-06, "loss": -0.0049, "num_tokens": 4945929.0, "reward": 0.5330367684364319, "reward_std": 0.17249098420143127, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6969093680381775, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.044164177030324936, "step": 21 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5389050901378578, "calib/avg_num_step_conf": 5.84765625, "calib/ece": 0.24429687500000002, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.27734375, "calib/gap": 0.0049257688229055185, "calib/mean_conf": 0.8755468750000001, "calib/mu_c": 0.8773170731707315, "calib/mu_w": 0.872391304347826, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.239609375, "calib/std_conf": 0.046825950900482256, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7113409563409563, "calib/step_q_c_n": 962.0, "calib/step_q_gap": 0.029228806808246133, "calib/step_q_w": 0.6821121495327102, "calib/step_q_w_n": 535.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1163.0, "completions/max_terminated_length": 1163.0, "completions/mean_length": 452.40625, "completions/mean_terminated_length": 454.180419921875, "completions/min_length": 0.0, "completions/min_terminated_length": 192.0, "epoch": 0.023466666666666667, "grad_norm": 0.008732265792787075, "kl": 0.01003265380859375, "learning_rate": 4.944444444444445e-06, "loss": 0.0148, "num_tokens": 5163561.0, "reward": 0.5561990141868591, "reward_std": 0.1382788121700287, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.714662492275238, "rewards/format_reward_step": 1.0, "rewards/step_margin_reward": 0.06961052119731903, "step": 22 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5421323894684551, "calib/avg_num_step_conf": 5.66796875, "calib/ece": 0.34881889763779533, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.23228346456692914, "calib/gap": 0.008319672131147593, "calib/mean_conf": 0.868503937007874, "calib/mu_c": 0.8724999999999999, "calib/mu_w": 0.8641803278688523, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34881889763779533, "calib/std_conf": 0.053212968938409426, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6867546174142481, "calib/step_q_c_n": 758.0, "calib/step_q_gap": 0.005773376433007238, "calib/step_q_w": 0.6809812409812409, "calib/step_q_w_n": 693.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1985.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 502.671875, "completions/mean_terminated_length": 502.671875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.024533333333333334, "grad_norm": 0.0077652414329349995, "kl": 0.012403488159179688, "learning_rate": 4.9166666666666665e-06, "loss": -0.0086, "num_tokens": 5396181.0, "reward": 0.4989694654941559, "reward_std": 0.19842839241027832, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.6251125335693359, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.07126393914222717, "step": 23 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5088501334027461, "calib/avg_num_step_conf": 6.88671875, "calib/ece": 0.357741935483871, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.3024193548387097, "calib/gap": 0.008238433005791745, "calib/mean_conf": 0.8698387096774195, "calib/mu_c": 0.8738582677165355, "calib/mu_w": 0.8656198347107438, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.357741935483871, "calib/std_conf": 0.07052206096113496, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.6419435028248588, "calib/step_q_c_n": 885.0, "calib/step_q_gap": 0.01714851421438046, "calib/step_q_w": 0.6247949886104783, "calib/step_q_w_n": 878.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2530.0, "completions/max_terminated_length": 2530.0, "completions/mean_length": 575.76171875, "completions/mean_terminated_length": 578.0196533203125, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.0256, "grad_norm": 0.007695337291806936, "kl": 0.018037796020507812, "learning_rate": 4.888888888888889e-06, "loss": 0.0504, "num_tokens": 5648088.0, "reward": 0.48372983932495117, "reward_std": 0.21942198276519775, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6018944978713989, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.07259640097618103, "step": 24 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5281599588530281, "calib/avg_num_step_conf": 6.44921875, "calib/ece": 0.2767843137254902, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.32941176470588235, "calib/gap": 0.008054519737688337, "calib/mean_conf": 0.8807058823529412, "calib/mu_c": 0.883896103896104, "calib/mu_w": 0.8758415841584156, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2767843137254902, "calib/std_conf": 0.054853564103070726, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6362807933194154, "calib/step_q_c_n": 958.0, "calib/step_q_gap": 0.0004799275185496388, "calib/step_q_w": 0.6358008658008658, "calib/step_q_w_n": 693.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2459.0, "completions/max_terminated_length": 2459.0, "completions/mean_length": 513.18359375, "completions/mean_terminated_length": 513.18359375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.02666666666666667, "grad_norm": 0.007971785962581635, "kl": 0.022886276245117188, "learning_rate": 4.861111111111111e-06, "loss": 0.0247, "num_tokens": 5882687.0, "reward": 0.549470067024231, "reward_std": 0.1518770456314087, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6811434030532837, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.0990467220544815, "step": 25 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.47485461274121066, "calib/avg_num_step_conf": 6.02734375, "calib/ece": 0.26877470355731214, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.3359683794466403, "calib/gap": -0.001648162833730038, "calib/mean_conf": 0.8853754940711462, "calib/mu_c": 0.8847435897435896, "calib/mu_w": 0.8863917525773196, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26877470355731214, "calib/std_conf": 0.04970622502840137, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6674917127071823, "calib/step_q_c_n": 905.0, "calib/step_q_gap": -0.0069596979511249435, "calib/step_q_w": 0.6744514106583073, "calib/step_q_w_n": 638.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2619.0, "completions/max_terminated_length": 2619.0, "completions/mean_length": 510.875, "completions/mean_terminated_length": 512.8784790039062, "completions/min_length": 0.0, "completions/min_terminated_length": 225.0, "epoch": 0.027733333333333332, "grad_norm": 0.008650614880025387, "kl": 0.029087066650390625, "learning_rate": 4.833333333333333e-06, "loss": 0.0388, "num_tokens": 6118711.0, "reward": 0.5607102513313293, "reward_std": 0.14517641067504883, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6800421476364136, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.12184715270996094, "step": 26 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5288062015503876, "calib/avg_num_step_conf": 7.20703125, "calib/ece": 0.3875590551181102, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.29133858267716534, "calib/gap": 0.008540775193798589, "calib/mean_conf": 0.8711023622047245, "calib/mu_c": 0.8754400000000001, "calib/mu_w": 0.8668992248062015, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.383267716535433, "calib/std_conf": 0.08867554187431534, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6481046511627906, "calib/step_q_c_n": 860.0, "calib/step_q_gap": 0.037434600401369256, "calib/step_q_w": 0.6106700507614213, "calib/step_q_w_n": 985.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2749.0, "completions/max_terminated_length": 2749.0, "completions/mean_length": 538.12109375, "completions/mean_terminated_length": 540.2313842773438, "completions/min_length": 0.0, "completions/min_terminated_length": 182.0, "epoch": 0.0288, "grad_norm": 0.007840660400688648, "kl": 0.038330078125, "learning_rate": 4.805555555555556e-06, "loss": -0.0064, "num_tokens": 6361686.0, "reward": 0.5033316016197205, "reward_std": 0.20111700892448425, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.5981351137161255, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.11243432015180588, "step": 27 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5401933900995816, "calib/avg_num_step_conf": 6.5625, "calib/ece": 0.21661354581673303, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.350597609561753, "calib/gap": 0.013213306393418889, "calib/mean_conf": 0.8782868525896413, "calib/mu_c": 0.882603550295858, "calib/mu_w": 0.8693902439024391, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21079681274900394, "calib/std_conf": 0.08362226172396235, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6238046511627907, "calib/step_q_c_n": 1075.0, "calib/step_q_gap": 0.028779857774360984, "calib/step_q_w": 0.5950247933884297, "calib/step_q_w_n": 605.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2903.0, "completions/max_terminated_length": 2903.0, "completions/mean_length": 580.86328125, "completions/mean_terminated_length": 583.1412353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 159.0, "epoch": 0.029866666666666666, "grad_norm": 0.007725785952061415, "kl": 0.03627777099609375, "learning_rate": 4.777777777777778e-06, "loss": 0.0077, "num_tokens": 6617331.0, "reward": 0.5768232941627502, "reward_std": 0.19204816222190857, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7224472761154175, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.10229302942752838, "step": 28 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5165031222123104, "calib/avg_num_step_conf": 7.30859375, "calib/ece": 0.36490039840637445, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.3904382470119522, "calib/gap": 0.0016420288008153205, "calib/mean_conf": 0.8923107569721116, "calib/mu_c": 0.8930827067669171, "calib/mu_w": 0.8914406779661018, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3636653386454183, "calib/std_conf": 0.05237239323852384, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6024052004333694, "calib/step_q_c_n": 923.0, "calib/step_q_gap": 0.03731026372450874, "calib/step_q_w": 0.5650949367088607, "calib/step_q_w_n": 948.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2982.0, "completions/max_terminated_length": 2982.0, "completions/mean_length": 651.48828125, "completions/mean_terminated_length": 651.48828125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.030933333333333334, "grad_norm": 0.006872882600873709, "kl": 0.035243988037109375, "learning_rate": 4.75e-06, "loss": 0.0606, "num_tokens": 6891240.0, "reward": 0.5300413966178894, "reward_std": 0.1835039108991623, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6055496335029602, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.15453320741653442, "step": 29 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.4484031132581857, "calib/avg_num_step_conf": 6.85546875, "calib/ece": 0.32394308943089434, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.3170731707317073, "calib/gap": -0.00435587761674705, "calib/mean_conf": 0.8779268292682927, "calib/mu_c": 0.8760144927536232, "calib/mu_w": 0.8803703703703702, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3204471544715448, "calib/std_conf": 0.085953350056187, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5931235697940505, "calib/step_q_c_n": 874.0, "calib/step_q_gap": 0.03481482972594607, "calib/step_q_w": 0.5583087400681044, "calib/step_q_w_n": 881.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2445.0, "completions/max_terminated_length": 2445.0, "completions/mean_length": 614.6875, "completions/mean_terminated_length": 624.4444580078125, "completions/min_length": 0.0, "completions/min_terminated_length": 211.0, "epoch": 0.032, "grad_norm": 0.006772920954972506, "kl": 0.038608551025390625, "learning_rate": 4.722222222222222e-06, "loss": 0.0047, "num_tokens": 7155584.0, "reward": 0.542879581451416, "reward_std": 0.2689790725708008, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6185808181762695, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.16717834770679474, "step": 30 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5401891252955082, "calib/avg_num_step_conf": 7.625, "calib/ece": 0.4343650793650794, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.38095238095238093, "calib/gap": 0.031278512555108495, "calib/mean_conf": 0.8748412698412698, "calib/mu_c": 0.8923423423423424, "calib/mu_w": 0.861063829787234, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4343650793650794, "calib/std_conf": 0.11757965302184178, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5797603195739014, "calib/step_q_c_n": 751.0, "calib/step_q_gap": 0.029137505252502582, "calib/step_q_w": 0.5506228143213988, "calib/step_q_w_n": 1201.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2770.0, "completions/max_terminated_length": 2770.0, "completions/mean_length": 630.57421875, "completions/mean_terminated_length": 633.047119140625, "completions/min_length": 0.0, "completions/min_terminated_length": 218.0, "epoch": 0.03306666666666667, "grad_norm": 0.006630671210587025, "kl": 0.04232025146484375, "learning_rate": 4.694444444444445e-06, "loss": 0.0466, "num_tokens": 7422923.0, "reward": 0.5082546472549438, "reward_std": 0.21992552280426025, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.5576117038726807, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.17530381679534912, "step": 31 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5533968090581575, "calib/avg_num_step_conf": 7.0859375, "calib/ece": 0.3418400000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.356, "calib/gap": 0.021060216160576406, "calib/mean_conf": 0.8778400000000001, "calib/mu_c": 0.8876119402985074, "calib/mu_w": 0.866551724137931, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3418400000000001, "calib/std_conf": 0.08215189833473113, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5951980324074073, "calib/step_q_c_n": 864.0, "calib/step_q_gap": 0.03201908503898632, "calib/step_q_w": 0.563178947368421, "calib/step_q_w_n": 950.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2987.0, "completions/max_terminated_length": 2987.0, "completions/mean_length": 590.328125, "completions/mean_terminated_length": 597.3280639648438, "completions/min_length": 0.0, "completions/min_terminated_length": 222.0, "epoch": 0.034133333333333335, "grad_norm": 0.007396162953227758, "kl": 0.0458984375, "learning_rate": 4.666666666666667e-06, "loss": 0.0152, "num_tokens": 7680751.0, "reward": 0.5492283701896667, "reward_std": 0.2305486500263214, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6220609545707703, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.1771770715713501, "step": 32 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5556763810352328, "calib/avg_num_step_conf": 6.765625, "calib/ece": 0.40216535433070866, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.33858267716535434, "calib/gap": 0.013349282296650666, "calib/mean_conf": 0.878464566929134, "calib/mu_c": 0.8854545454545455, "calib/mu_w": 0.8721052631578948, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4021259842519685, "calib/std_conf": 0.06160904106823838, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5805651105651106, "calib/step_q_c_n": 814.0, "calib/step_q_gap": 0.0001729537023654748, "calib/step_q_w": 0.5803921568627451, "calib/step_q_w_n": 918.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2244.0, "completions/max_terminated_length": 2244.0, "completions/mean_length": 577.65234375, "completions/mean_terminated_length": 579.9176635742188, "completions/min_length": 0.0, "completions/min_terminated_length": 191.0, "epoch": 0.0352, "grad_norm": 0.006979378871619701, "kl": 0.048908233642578125, "learning_rate": 4.638888888888889e-06, "loss": 0.0604, "num_tokens": 7935502.0, "reward": 0.5247606039047241, "reward_std": 0.21103210747241974, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.5871254205703735, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.16942700743675232, "step": 33 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4795370890261401, "calib/avg_num_step_conf": 6.77734375, "calib/ece": 0.3318503937007874, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.29133858267716534, "calib/gap": -0.0013818703599725035, "calib/mean_conf": 0.8712204724409448, "calib/mu_c": 0.8705839416058394, "calib/mu_w": 0.8719658119658119, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3318503937007874, "calib/std_conf": 0.06628860941639579, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5580782918149466, "calib/step_q_c_n": 843.0, "calib/step_q_gap": 0.03725990616472241, "calib/step_q_w": 0.5208183856502242, "calib/step_q_w_n": 892.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3059.0, "completions/max_terminated_length": 3059.0, "completions/mean_length": 522.265625, "completions/mean_terminated_length": 524.3137817382812, "completions/min_length": 0.0, "completions/min_terminated_length": 208.0, "epoch": 0.03626666666666667, "grad_norm": 0.010433186776936054, "kl": 0.05446624755859375, "learning_rate": 4.611111111111112e-06, "loss": 0.0187, "num_tokens": 8174314.0, "reward": 0.5656172037124634, "reward_std": 0.20620962977409363, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6313730478286743, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.194392591714859, "step": 34 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5385248496359607, "calib/avg_num_step_conf": 6.125, "calib/ece": 0.30519841269841275, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.20238095238095238, "calib/gap": 0.024358974358974494, "calib/mean_conf": 0.8409126984126986, "calib/mu_c": 0.8522222222222223, "calib/mu_w": 0.8278632478632478, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.30519841269841275, "calib/std_conf": 0.10666834223693417, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.538297619047619, "calib/step_q_c_n": 840.0, "calib/step_q_gap": 0.006965201465201409, "calib/step_q_w": 0.5313324175824176, "calib/step_q_w_n": 728.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2786.0, "completions/max_terminated_length": 2786.0, "completions/mean_length": 603.01171875, "completions/mean_terminated_length": 607.7598266601562, "completions/min_length": 0.0, "completions/min_terminated_length": 243.0, "epoch": 0.037333333333333336, "grad_norm": 0.007517626043409109, "kl": 0.044712066650390625, "learning_rate": 4.583333333333333e-06, "loss": 0.0204, "num_tokens": 8437941.0, "reward": 0.5808595418930054, "reward_std": 0.2326761782169342, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.646169126033783, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.21398743987083435, "step": 35 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5064760508308895, "calib/avg_num_step_conf": 6.26953125, "calib/ece": 0.12333333333333321, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.16666666666666666, "calib/gap": -0.007086999022482887, "calib/mean_conf": 0.8449206349206348, "calib/mu_c": 0.8430645161290322, "calib/mu_w": 0.8501515151515151, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11507936507936498, "calib/std_conf": 0.08589164484142289, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5507913043478261, "calib/step_q_c_n": 1150.0, "calib/step_q_gap": 0.026351743908265668, "calib/step_q_w": 0.5244395604395604, "calib/step_q_w_n": 455.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2036.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 530.5859375, "completions/mean_terminated_length": 532.6666870117188, "completions/min_length": 0.0, "completions/min_terminated_length": 196.0, "epoch": 0.0384, "grad_norm": 0.007462262641638517, "kl": 0.0557708740234375, "learning_rate": 4.555555555555556e-06, "loss": 0.0302, "num_tokens": 8676483.0, "reward": 0.6908488273620605, "reward_std": 0.18387025594711304, "rewards/accuracy_reward_step": 0.7265625, "rewards/final_brier_reward_step": 0.7709254026412964, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.2701471447944641, "step": 36 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5613636363636365, "calib/avg_num_step_conf": 6.5390625, "calib/ece": 0.3771370967741935, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.0846774193548387, "calib/gap": 0.009750988142292472, "calib/mean_conf": 0.8058467741935483, "calib/mu_c": 0.8112727272727271, "calib/mu_w": 0.8015217391304347, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.36971774193548385, "calib/std_conf": 0.10765128472293026, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5435054773082941, "calib/step_q_c_n": 639.0, "calib/step_q_gap": 0.10539919711505746, "calib/step_q_w": 0.43810628019323666, "calib/step_q_w_n": 1035.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2192.0, "completions/max_terminated_length": 2192.0, "completions/mean_length": 550.25390625, "completions/mean_terminated_length": 556.7786865234375, "completions/min_length": 0.0, "completions/min_terminated_length": 218.0, "epoch": 0.039466666666666664, "grad_norm": 0.007464609574526548, "kl": 0.054592132568359375, "learning_rate": 4.527777777777778e-06, "loss": 0.0323, "num_tokens": 8924444.0, "reward": 0.5121106505393982, "reward_std": 0.1716907024383545, "rewards/accuracy_reward_step": 0.4296875, "rewards/final_brier_reward_step": 0.5959277153015137, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.14860600233078003, "step": 37 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5289828089120587, "calib/avg_num_step_conf": 6.1640625, "calib/ece": 0.26905511811023625, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.05905511811023622, "calib/gap": 0.013293613852169095, "calib/mean_conf": 0.7641732283464567, "calib/mu_c": 0.7706106870229008, "calib/mu_w": 0.7573170731707317, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.258740157480315, "calib/std_conf": 0.1267115326701758, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.49724181360201514, "calib/step_q_c_n": 794.0, "calib/step_q_gap": 0.03290507890813765, "calib/step_q_w": 0.4643367346938775, "calib/step_q_w_n": 784.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1547.0, "completions/max_terminated_length": 1547.0, "completions/mean_length": 503.859375, "completions/mean_terminated_length": 507.8267822265625, "completions/min_length": 0.0, "completions/min_terminated_length": 184.0, "epoch": 0.04053333333333333, "grad_norm": 0.0074347201734781265, "kl": 0.05693817138671875, "learning_rate": 4.5e-06, "loss": -0.0008, "num_tokens": 9160320.0, "reward": 0.5768137574195862, "reward_std": 0.18113070726394653, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.673811674118042, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.1790345162153244, "step": 38 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4705807680101555, "calib/avg_num_step_conf": 6.109375, "calib/ece": 0.26968253968253975, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.04365079365079365, "calib/gap": -0.014816883529038316, "calib/mean_conf": 0.746031746031746, "calib/mu_c": 0.7392700729927008, "calib/mu_w": 0.7540869565217391, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23603174603174615, "calib/std_conf": 0.15515777615100693, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.48040342298288513, "calib/step_q_c_n": 818.0, "calib/step_q_gap": 0.024183181696021905, "calib/step_q_w": 0.4562202412868632, "calib/step_q_w_n": 746.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2499.0, "completions/max_terminated_length": 2499.0, "completions/mean_length": 525.46875, "completions/mean_terminated_length": 531.6996459960938, "completions/min_length": 0.0, "completions/min_terminated_length": 182.0, "epoch": 0.0416, "grad_norm": 0.006516232155263424, "kl": 0.05139923095703125, "learning_rate": 4.472222222222223e-06, "loss": 0.0296, "num_tokens": 9400928.0, "reward": 0.5995521545410156, "reward_std": 0.1989758312702179, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6689039468765259, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.2262941598892212, "step": 39 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5063053641732284, "calib/avg_num_step_conf": 5.8203125, "calib/ece": 0.21466666666666673, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.011764705882352941, "calib/gap": 0.016117125984251968, "calib/mean_conf": 0.6949019607843137, "calib/mu_c": 0.702992125984252, "calib/mu_w": 0.686875, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20576470588235302, "calib/std_conf": 0.15402027558173298, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.47658192090395485, "calib/step_q_c_n": 708.0, "calib/step_q_gap": 0.03949752192697281, "calib/step_q_w": 0.43708439897698204, "calib/step_q_w_n": 782.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2304.0, "completions/max_terminated_length": 2304.0, "completions/mean_length": 556.90234375, "completions/mean_terminated_length": 556.90234375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.042666666666666665, "grad_norm": 0.08801660686731339, "kl": 0.13840866088867188, "learning_rate": 4.444444444444444e-06, "loss": -0.0039, "num_tokens": 9650255.0, "reward": 0.5951794385910034, "reward_std": 0.1751280426979065, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6928679347038269, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.19905337691307068, "step": 40 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4886706258565554, "calib/avg_num_step_conf": 5.82421875, "calib/ece": 0.1355905511811023, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.01968503937007874, "calib/gap": -0.0021306532663315503, "calib/mean_conf": 0.6843307086614173, "calib/mu_c": 0.6838693467336684, "calib/mu_w": 0.6859999999999999, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.018228346456692867, "calib/std_conf": 0.15763053858555895, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.43566401746724887, "calib/step_q_c_n": 1145.0, "calib/step_q_gap": 0.013929913421006024, "calib/step_q_w": 0.42173410404624284, "calib/step_q_w_n": 346.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2879.0, "completions/max_terminated_length": 2879.0, "completions/mean_length": 482.26171875, "completions/mean_terminated_length": 482.26171875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.04373333333333333, "grad_norm": 0.006632791832089424, "kl": 0.06319427490234375, "learning_rate": 4.416666666666667e-06, "loss": 0.0212, "num_tokens": 9880962.0, "reward": 0.7019416093826294, "reward_std": 0.1809065341949463, "rewards/accuracy_reward_step": 0.77734375, "rewards/final_brier_reward_step": 0.7887437343597412, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.26123327016830444, "step": 41 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6127613462519124, "calib/avg_num_step_conf": 5.73046875, "calib/ece": 0.08677165354330707, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.011811023622047244, "calib/gap": 0.06857470678225386, "calib/mean_conf": 0.6425984251968504, "calib/mu_c": 0.6712162162162162, "calib/mu_w": 0.6026415094339623, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07334645669291337, "calib/std_conf": 0.16609556398493233, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.44271764705882355, "calib/step_q_c_n": 850.0, "calib/step_q_gap": 0.052199008485079634, "calib/step_q_w": 0.3905186385737439, "calib/step_q_w_n": 617.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2762.0, "completions/max_terminated_length": 2762.0, "completions/mean_length": 450.9921875, "completions/mean_terminated_length": 450.9921875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.0448, "grad_norm": 0.007387762889266014, "kl": 0.06370925903320312, "learning_rate": 4.388888888888889e-06, "loss": 0.0156, "num_tokens": 10100784.0, "reward": 0.642842173576355, "reward_std": 0.16730615496635437, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7530773878097534, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.21854455769062042, "step": 42 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5963322884012539, "calib/avg_num_step_conf": 5.73046875, "calib/ece": 0.08086274509803923, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.07805015673981208, "calib/mean_conf": 0.6107450980392156, "calib/mu_c": 0.6444137931034484, "calib/mu_w": 0.5663636363636363, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0614901960784314, "calib/std_conf": 0.1649379970416994, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4295169082125604, "calib/step_q_c_n": 828.0, "calib/step_q_gap": 0.05078451384636323, "calib/step_q_w": 0.37873239436619716, "calib/step_q_w_n": 639.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1838.0, "completions/max_terminated_length": 1838.0, "completions/mean_length": 497.91015625, "completions/mean_terminated_length": 499.8627624511719, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.04586666666666667, "grad_norm": 0.006303076166659594, "kl": 0.05922698974609375, "learning_rate": 4.361111111111112e-06, "loss": 0.023, "num_tokens": 10333473.0, "reward": 0.6634681820869446, "reward_std": 0.1730535626411438, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.76103675365448, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.2526184022426605, "step": 43 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5205368793885541, "calib/avg_num_step_conf": 6.27734375, "calib/ece": 0.17744094488188975, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.011811023622047244, "calib/gap": 0.020358540980550566, "calib/mean_conf": 0.6124803149606299, "calib/mu_c": 0.6231404958677687, "calib/mu_w": 0.6027819548872181, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15677165354330705, "calib/std_conf": 0.18767472071006683, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.41140000000000004, "calib/step_q_c_n": 732.0, "calib/step_q_gap": 0.037411428571428595, "calib/step_q_w": 0.37398857142857145, "calib/step_q_w_n": 875.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1799.0, "completions/max_terminated_length": 1799.0, "completions/mean_length": 524.71484375, "completions/mean_terminated_length": 526.7725830078125, "completions/min_length": 0.0, "completions/min_terminated_length": 179.0, "epoch": 0.046933333333333334, "grad_norm": 0.006215093191713095, "kl": 0.05739593505859375, "learning_rate": 4.333333333333334e-06, "loss": 0.0134, "num_tokens": 10574120.0, "reward": 0.5865083336830139, "reward_std": 0.17506547272205353, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.7014456987380981, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.1786021888256073, "step": 44 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6223754110801922, "calib/avg_num_step_conf": 6.04296875, "calib/ece": 0.09706349206349207, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.015873015873015872, "calib/gap": 0.08617252719453583, "calib/mean_conf": 0.5691269841269841, "calib/mu_c": 0.6094776119402986, "calib/mu_w": 0.5233050847457628, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.06722222222222222, "calib/std_conf": 0.19493393199565195, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.42671205846528626, "calib/step_q_c_n": 821.0, "calib/step_q_gap": 0.05219236149558931, "calib/step_q_w": 0.37451969696969695, "calib/step_q_w_n": 726.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2819.0, "completions/max_terminated_length": 2819.0, "completions/mean_length": 490.828125, "completions/mean_terminated_length": 490.828125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.048, "grad_norm": 0.005764865782111883, "kl": 0.06572723388671875, "learning_rate": 4.305555555555556e-06, "loss": 0.0445, "num_tokens": 10804820.0, "reward": 0.6049693822860718, "reward_std": 0.17337967455387115, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7427343726158142, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.1656419336795807, "step": 45 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4638007572790182, "calib/avg_num_step_conf": 6.61328125, "calib/ece": 0.1714457831325301, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.020080321285140562, "calib/gap": -0.017647865256560813, "calib/mean_conf": 0.5830120481927712, "calib/mu_c": 0.5751449275362319, "calib/mu_w": 0.5927927927927927, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10012048192771085, "calib/std_conf": 0.18503707393477412, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.37555155875299767, "calib/step_q_c_n": 834.0, "calib/step_q_gap": 0.030021873071973293, "calib/step_q_w": 0.3455296856810244, "calib/step_q_w_n": 859.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2927.0, "completions/max_terminated_length": 2927.0, "completions/mean_length": 530.265625, "completions/mean_terminated_length": 530.265625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.04906666666666667, "grad_norm": 0.005777245853096247, "kl": 0.0604095458984375, "learning_rate": 4.277777777777778e-06, "loss": 0.0352, "num_tokens": 11045336.0, "reward": 0.5770740509033203, "reward_std": 0.1873425543308258, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6897605657577515, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.16204379498958588, "step": 46 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5565924142083082, "calib/avg_num_step_conf": 6.47265625, "calib/ece": 0.13240000000000002, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.02, "calib/gap": 0.060226102080406574, "calib/mean_conf": 0.59072, "calib/mu_c": 0.6145695364238409, "calib/mu_w": 0.5543434343434344, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.059560000000000016, "calib/std_conf": 0.19188194704036127, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.39741200828157347, "calib/step_q_c_n": 966.0, "calib/step_q_gap": 0.0360314004668123, "calib/step_q_w": 0.36138060781476117, "calib/step_q_w_n": 691.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2421.0, "completions/max_terminated_length": 2421.0, "completions/mean_length": 524.9609375, "completions/mean_terminated_length": 529.094482421875, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.050133333333333335, "grad_norm": 0.0055895112454891205, "kl": 0.06354522705078125, "learning_rate": 4.25e-06, "loss": 0.0356, "num_tokens": 11285702.0, "reward": 0.6485556364059448, "reward_std": 0.1632830947637558, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7349914312362671, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.24883851408958435, "step": 47 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5178482587064677, "calib/avg_num_step_conf": 5.52734375, "calib/ece": 0.13555118110236217, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.023622047244094488, "calib/gap": 0.031546019900497435, "calib/mean_conf": 0.5625590551181102, "calib/mu_c": 0.5774626865671642, "calib/mu_w": 0.5459166666666667, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08527559055118106, "calib/std_conf": 0.18684741147233042, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42755827338129493, "calib/step_q_c_n": 695.0, "calib/step_q_gap": 0.05981382893685039, "calib/step_q_w": 0.36774444444444454, "calib/step_q_w_n": 720.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2358.0, "completions/max_terminated_length": 2358.0, "completions/mean_length": 471.73828125, "completions/mean_terminated_length": 471.73828125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.0512, "grad_norm": 0.005875094328075647, "kl": 0.07415771484375, "learning_rate": 4.222222222222223e-06, "loss": 0.0597, "num_tokens": 11510155.0, "reward": 0.59670490026474, "reward_std": 0.19118118286132812, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7246417999267578, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.16564300656318665, "step": 48 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.509004270066217, "calib/avg_num_step_conf": 5.84375, "calib/ece": 0.14707031250000002, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.015625, "calib/gap": 0.009915836376013454, "calib/mean_conf": 0.5952734375000001, "calib/mu_c": 0.5996503496503497, "calib/mu_w": 0.5897345132743362, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09187500000000001, "calib/std_conf": 0.1655339473701198, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3988808664259928, "calib/step_q_c_n": 831.0, "calib/step_q_gap": 0.007707934095165736, "calib/step_q_w": 0.39117293233082706, "calib/step_q_w_n": 665.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1834.0, "completions/max_terminated_length": 1834.0, "completions/mean_length": 464.66015625, "completions/mean_terminated_length": 466.4823913574219, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.05226666666666667, "grad_norm": 0.005934995133429766, "kl": 0.072479248046875, "learning_rate": 4.194444444444445e-06, "loss": -0.0048, "num_tokens": 11733644.0, "reward": 0.6388311982154846, "reward_std": 0.15334394574165344, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7295762300491333, "rewards/format_reward_step": 1.0, "rewards/step_margin_reward": 0.23636746406555176, "step": 49 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5598329635912828, "calib/avg_num_step_conf": 5.84765625, "calib/ece": 0.09909803921568625, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.00784313725490196, "calib/gap": 0.03743051024402966, "calib/mean_conf": 0.6024705882352941, "calib/mu_c": 0.6167088607594936, "calib/mu_w": 0.579278350515464, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.04098039215686272, "calib/std_conf": 0.1625114599809631, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.43825740318906603, "calib/step_q_c_n": 878.0, "calib/step_q_gap": 0.026513138245608803, "calib/step_q_w": 0.4117442649434572, "calib/step_q_w_n": 619.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1448.0, "completions/max_terminated_length": 1448.0, "completions/mean_length": 478.62890625, "completions/mean_terminated_length": 480.50592041015625, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.05333333333333334, "grad_norm": 0.006061997264623642, "kl": 0.0747833251953125, "learning_rate": 4.166666666666667e-06, "loss": -0.0099, "num_tokens": 11961533.0, "reward": 0.6497402191162109, "reward_std": 0.186766117811203, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7475171685218811, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.23086941242218018, "step": 50 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4809847748623259, "calib/avg_num_step_conf": 5.921875, "calib/ece": 0.12714285714285706, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.01984126984126984, "calib/gap": 0.016517006802721057, "calib/mean_conf": 0.6237301587301588, "calib/mu_c": 0.6306122448979592, "calib/mu_w": 0.6140952380952381, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08376984126984119, "calib/std_conf": 0.168413390331149, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42119133574007217, "calib/step_q_c_n": 831.0, "calib/step_q_gap": 0.03764296591038846, "calib/step_q_w": 0.3835483698296837, "calib/step_q_w_n": 685.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2825.0, "completions/max_terminated_length": 2825.0, "completions/mean_length": 529.2421875, "completions/mean_terminated_length": 529.2421875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.0544, "grad_norm": 0.00549277663230896, "kl": 0.06853485107421875, "learning_rate": 4.138888888888889e-06, "loss": 0.0231, "num_tokens": 12206315.0, "reward": 0.6381635665893555, "reward_std": 0.18898846209049225, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7234945297241211, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.24033261835575104, "step": 51 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5987903225806452, "calib/avg_num_step_conf": 5.54296875, "calib/ece": 0.15220472440944885, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.023622047244094488, "calib/gap": 0.08058823529411774, "calib/mean_conf": 0.5884251968503939, "calib/mu_c": 0.6100000000000001, "calib/mu_w": 0.5294117647058824, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.004173228346456682, "calib/std_conf": 0.18439056971484308, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.41865711556829033, "calib/step_q_c_n": 1047.0, "calib/step_q_gap": -0.0016654650768709311, "calib/step_q_w": 0.42032258064516126, "calib/step_q_w_n": 372.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1337.0, "completions/max_terminated_length": 1337.0, "completions/mean_length": 458.71484375, "completions/mean_terminated_length": 460.5137634277344, "completions/min_length": 0.0, "completions/min_terminated_length": 176.0, "epoch": 0.055466666666666664, "grad_norm": 0.006017228588461876, "kl": 0.0811004638671875, "learning_rate": 4.111111111111111e-06, "loss": -0.0145, "num_tokens": 12431698.0, "reward": 0.721343994140625, "reward_std": 0.17622464895248413, "rewards/accuracy_reward_step": 0.73046875, "rewards/final_brier_reward_step": 0.7716425657272339, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.32729530334472656, "step": 52 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.603898966994205, "calib/avg_num_step_conf": 6.33203125, "calib/ece": 0.08941176470588239, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00784313725490196, "calib/gap": 0.06378117913832193, "calib/mean_conf": 0.5948235294117648, "calib/mu_c": 0.6218367346938776, "calib/mu_w": 0.5580555555555556, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05388235294117648, "calib/std_conf": 0.168236144245761, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4338461538461539, "calib/step_q_c_n": 962.0, "calib/step_q_gap": 0.051437200887125056, "calib/step_q_w": 0.3824089529590288, "calib/step_q_w_n": 659.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2627.0, "completions/max_terminated_length": 2627.0, "completions/mean_length": 517.9453125, "completions/mean_terminated_length": 517.9453125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.05653333333333333, "grad_norm": 0.0055801658891141415, "kl": 0.07416534423828125, "learning_rate": 4.083333333333334e-06, "loss": 0.0473, "num_tokens": 12670116.0, "reward": 0.6302124857902527, "reward_std": 0.16622228920459747, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7553898096084595, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.19097262620925903, "step": 53 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.591922388881244, "calib/avg_num_step_conf": 5.55859375, "calib/ece": 0.10635294117647061, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.043137254901960784, "calib/gap": 0.061826063024632005, "calib/mean_conf": 0.6305098039215686, "calib/mu_c": 0.6513609467455621, "calib/mu_w": 0.5895348837209301, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.037058823529411755, "calib/std_conf": 0.16956499968911448, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4601995515695067, "calib/step_q_c_n": 892.0, "calib/step_q_gap": 0.05714493763353684, "calib/step_q_w": 0.40305461393596986, "calib/step_q_w_n": 531.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2176.0, "completions/max_terminated_length": 2176.0, "completions/mean_length": 460.70703125, "completions/mean_terminated_length": 460.70703125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.0576, "grad_norm": 0.006231546867638826, "kl": 0.08040618896484375, "learning_rate": 4.055555555555556e-06, "loss": 0.0117, "num_tokens": 12894289.0, "reward": 0.6591579914093018, "reward_std": 0.15599699318408966, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7713078260421753, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.21575812995433807, "step": 54 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5949283559577677, "calib/avg_num_step_conf": 5.6875, "calib/ece": 0.12999999999999998, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.039525691699604744, "calib/gap": 0.0689335093011566, "calib/mean_conf": 0.6536363636363637, "calib/mu_c": 0.685514705882353, "calib/mu_w": 0.6165811965811964, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12304347826086956, "calib/std_conf": 0.17501501398558267, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.45723473541383985, "calib/step_q_c_n": 737.0, "calib/step_q_gap": 0.006887030267803684, "calib/step_q_w": 0.45034770514603617, "calib/step_q_w_n": 719.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2976.0, "completions/max_terminated_length": 2976.0, "completions/mean_length": 529.046875, "completions/mean_terminated_length": 529.046875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.058666666666666666, "grad_norm": 0.00565149262547493, "kl": 0.08214569091796875, "learning_rate": 4.027777777777779e-06, "loss": 0.0722, "num_tokens": 13137549.0, "reward": 0.6621849536895752, "reward_std": 0.2255595624446869, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7321434020996094, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.28910163044929504, "step": 55 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5978233830845772, "calib/avg_num_step_conf": 6.20703125, "calib/ece": 0.2053543307086614, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.031496062992125984, "calib/gap": 0.050671641791044775, "calib/mean_conf": 0.6682677165354332, "calib/mu_c": 0.6950000000000001, "calib/mu_w": 0.6443283582089553, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20059055118110233, "calib/std_conf": 0.14187215320638408, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4754463130659767, "calib/step_q_c_n": 773.0, "calib/step_q_gap": 0.018247783654212035, "calib/step_q_w": 0.45719852941176464, "calib/step_q_w_n": 816.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1854.0, "completions/max_terminated_length": 1854.0, "completions/mean_length": 531.5546875, "completions/mean_terminated_length": 533.6392211914062, "completions/min_length": 0.0, "completions/min_terminated_length": 210.0, "epoch": 0.05973333333333333, "grad_norm": 0.005589254200458527, "kl": 0.071380615234375, "learning_rate": 4.000000000000001e-06, "loss": 0.0061, "num_tokens": 13380467.0, "reward": 0.6137272119522095, "reward_std": 0.22647228837013245, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.7119367122650146, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.2225489318370819, "step": 56 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5939275568181819, "calib/avg_num_step_conf": 6.32421875, "calib/ece": 0.06612903225806455, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.03225806451612903, "calib/gap": 0.05914204545454538, "calib/mean_conf": 0.6889516129032257, "calib/mu_c": 0.7099375, "calib/mu_w": 0.6507954545454546, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.05495967741935488, "calib/std_conf": 0.14916335788694293, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5133693599160546, "calib/step_q_c_n": 953.0, "calib/step_q_gap": 0.09219818874488345, "calib/step_q_w": 0.42117117117117114, "calib/step_q_w_n": 666.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2994.0, "completions/max_terminated_length": 2994.0, "completions/mean_length": 543.9296875, "completions/mean_terminated_length": 552.5635375976562, "completions/min_length": 0.0, "completions/min_terminated_length": 190.0, "epoch": 0.0608, "grad_norm": 0.005411534570157528, "kl": 0.07976531982421875, "learning_rate": 3.972222222222223e-06, "loss": -0.0103, "num_tokens": 13626505.0, "reward": 0.6742805242538452, "reward_std": 0.19319787621498108, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7497960925102234, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.28001493215560913, "step": 57 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5088778409090909, "calib/avg_num_step_conf": 6.8515625, "calib/ece": 0.18978714859437754, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.0321285140562249, "calib/gap": 0.008725271177685956, "calib/mean_conf": 0.6730803212851405, "calib/mu_c": 0.6773203125, "calib/mu_w": 0.6685950413223141, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17440562248995986, "calib/std_conf": 0.14457341200973897, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4824614443084455, "calib/step_q_c_n": 817.0, "calib/step_q_gap": 0.0501206403240983, "calib/step_q_w": 0.4323408039843472, "calib/step_q_w_n": 937.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2779.0, "completions/max_terminated_length": 2779.0, "completions/mean_length": 626.45703125, "completions/mean_terminated_length": 626.45703125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.06186666666666667, "grad_norm": 0.005182043649256229, "kl": 0.07784271240234375, "learning_rate": 3.944444444444445e-06, "loss": 0.0669, "num_tokens": 13893198.0, "reward": 0.5870130658149719, "reward_std": 0.23078709840774536, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.6881850957870483, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.19209106266498566, "step": 58 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.47987708168120535, "calib/avg_num_step_conf": 6.2734375, "calib/ece": 0.16229249011857705, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.09486166007905138, "calib/gap": -0.008536214644462081, "calib/mean_conf": 0.726798418972332, "calib/mu_c": 0.7235256410256411, "calib/mu_w": 0.7320618556701032, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13624505928853756, "calib/std_conf": 0.13775596684232014, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5421439073514602, "calib/step_q_c_n": 993.0, "calib/step_q_gap": 0.05318795302845858, "calib/step_q_w": 0.48895595432300165, "calib/step_q_w_n": 613.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2146.0, "completions/max_terminated_length": 2146.0, "completions/mean_length": 585.59765625, "completions/mean_terminated_length": 585.59765625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.06293333333333333, "grad_norm": 0.005232793744653463, "kl": 0.0828094482421875, "learning_rate": 3.916666666666667e-06, "loss": 0.0239, "num_tokens": 14149359.0, "reward": 0.602554202079773, "reward_std": 0.23856589198112488, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7199031114578247, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.16567403078079224, "step": 59 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5296653144016228, "calib/avg_num_step_conf": 6.5078125, "calib/ece": 0.18662698412698414, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.06746031746031746, "calib/gap": 0.03103448275862064, "calib/mean_conf": 0.7032142857142857, "calib/mu_c": 0.7175, "calib/mu_w": 0.6864655172413794, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1750793650793651, "calib/std_conf": 0.16750617648611568, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5243504171632896, "calib/step_q_c_n": 839.0, "calib/step_q_gap": 0.04954993348735248, "calib/step_q_w": 0.4748004836759371, "calib/step_q_w_n": 827.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2879.0, "completions/max_terminated_length": 2879.0, "completions/mean_length": 584.83203125, "completions/mean_terminated_length": 584.83203125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.064, "grad_norm": 0.005824708379805088, "kl": 0.08036041259765625, "learning_rate": 3.88888888888889e-06, "loss": 0.034, "num_tokens": 14407932.0, "reward": 0.6443766355514526, "reward_std": 0.24165666103363037, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.701065182685852, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.2845630347728729, "step": 60 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.569233437391606, "calib/avg_num_step_conf": 7.01953125, "calib/ece": 0.13028225806451615, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.0967741935483871, "calib/gap": 0.031204301075268837, "calib/mean_conf": 0.7464919354838709, "calib/mu_c": 0.7581935483870967, "calib/mu_w": 0.7269892473118279, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12588709677419357, "calib/std_conf": 0.12576616781639505, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5683931458131264, "calib/step_q_c_n": 1031.0, "calib/step_q_gap": 0.07113727113949714, "calib/step_q_w": 0.49725587467362925, "calib/step_q_w_n": 766.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2928.0, "completions/max_terminated_length": 2928.0, "completions/mean_length": 526.8671875, "completions/mean_terminated_length": 528.933349609375, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.06506666666666666, "grad_norm": 0.005149191245436668, "kl": 0.09716796875, "learning_rate": 3.861111111111112e-06, "loss": 0.0752, "num_tokens": 14646874.0, "reward": 0.6565155982971191, "reward_std": 0.217573881149292, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7262473106384277, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.2719402015209198, "step": 61 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5750872717508055, "calib/avg_num_step_conf": 7.09375, "calib/ece": 0.20253061224489796, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.08163265306122448, "calib/gap": 0.03785714285714292, "calib/mean_conf": 0.7369795918367347, "calib/mu_c": 0.7542857142857143, "calib/mu_w": 0.7164285714285714, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19832653061224492, "calib/std_conf": 0.1401655711599996, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5544982290436835, "calib/step_q_c_n": 847.0, "calib/step_q_gap": 0.04672031366700646, "calib/step_q_w": 0.507777915376677, "calib/step_q_w_n": 969.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2984.0, "completions/max_terminated_length": 2984.0, "completions/mean_length": 603.61328125, "completions/mean_terminated_length": 605.9804077148438, "completions/min_length": 0.0, "completions/min_terminated_length": 231.0, "epoch": 0.06613333333333334, "grad_norm": 0.005013887770473957, "kl": 0.088714599609375, "learning_rate": 3.833333333333334e-06, "loss": 0.0404, "num_tokens": 14908479.0, "reward": 0.5980898141860962, "reward_std": 0.2827584743499756, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6826468706130981, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.21822020411491394, "step": 62 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6009721756620852, "calib/avg_num_step_conf": 6.85546875, "calib/ece": 0.1386904761904762, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.11904761904761904, "calib/gap": 0.04693865236339256, "calib/mean_conf": 0.7537698412698413, "calib/mu_c": 0.7714649681528662, "calib/mu_w": 0.7245263157894737, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1347222222222222, "calib/std_conf": 0.13496499620850624, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5428584905660377, "calib/step_q_c_n": 1060.0, "calib/step_q_gap": 0.022987986968915464, "calib/step_q_w": 0.5198705035971223, "calib/step_q_w_n": 695.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2199.0, "completions/max_terminated_length": 2199.0, "completions/mean_length": 618.79296875, "completions/mean_terminated_length": 621.2196655273438, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.0672, "grad_norm": 0.016880923882126808, "kl": 0.158935546875, "learning_rate": 3.8055555555555556e-06, "loss": 0.0382, "num_tokens": 15175530.0, "reward": 0.69347083568573, "reward_std": 0.22258234024047852, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7401214838027954, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.3272889256477356, "step": 63 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5192307692307692, "calib/avg_num_step_conf": 6.453125, "calib/ece": 0.1411764705882353, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.08627450980392157, "calib/gap": 0.027687491399477016, "calib/mean_conf": 0.7347450980392157, "calib/mu_c": 0.7440828402366864, "calib/mu_w": 0.7163953488372093, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10658823529411766, "calib/std_conf": 0.14909607944994172, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5333053221288515, "calib/step_q_c_n": 1071.0, "calib/step_q_gap": -0.004749755323816274, "calib/step_q_w": 0.5380550774526678, "calib/step_q_w_n": 581.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2278.0, "completions/max_terminated_length": 2278.0, "completions/mean_length": 591.609375, "completions/mean_terminated_length": 591.609375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.06826666666666667, "grad_norm": 0.0052238283678889275, "kl": 0.091217041015625, "learning_rate": 3.777777777777778e-06, "loss": 0.0254, "num_tokens": 15430758.0, "reward": 0.6832898855209351, "reward_std": 0.23457638919353485, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7575253844261169, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.27858564257621765, "step": 64 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6231354927292019, "calib/avg_num_step_conf": 6.21484375, "calib/ece": 0.21589843750000004, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.18359375, "calib/gap": 0.05692254883604819, "calib/mean_conf": 0.7901171875, "calib/mu_c": 0.8143537414965987, "calib/mu_w": 0.7574311926605505, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21589843750000004, "calib/std_conf": 0.1254102094810859, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5790928495197438, "calib/step_q_c_n": 937.0, "calib/step_q_gap": 0.010912421385187332, "calib/step_q_w": 0.5681804281345565, "calib/step_q_w_n": 654.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1094.0, "completions/max_terminated_length": 1094.0, "completions/mean_length": 512.546875, "completions/mean_terminated_length": 514.556884765625, "completions/min_length": 0.0, "completions/min_terminated_length": 214.0, "epoch": 0.06933333333333333, "grad_norm": 0.0056617711670696735, "kl": 0.101287841796875, "learning_rate": 3.7500000000000005e-06, "loss": -0.0176, "num_tokens": 15666994.0, "reward": 0.6725325584411621, "reward_std": 0.1979164481163025, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7156343460083008, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.3161495625972748, "step": 65 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5088254593175854, "calib/avg_num_step_conf": 6.92578125, "calib/ece": 0.30008097165991904, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.19433198380566802, "calib/gap": 0.00526443569553825, "calib/mean_conf": 0.7813765182186235, "calib/mu_c": 0.7840833333333334, "calib/mu_w": 0.7788188976377951, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2978137651821863, "calib/std_conf": 0.1266051911588266, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5069146341463415, "calib/step_q_c_n": 820.0, "calib/step_q_gap": 0.021710017147390848, "calib/step_q_w": 0.4852046169989507, "calib/step_q_w_n": 953.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2734.0, "completions/max_terminated_length": 2734.0, "completions/mean_length": 669.875, "completions/mean_terminated_length": 672.5020141601562, "completions/min_length": 0.0, "completions/min_terminated_length": 197.0, "epoch": 0.0704, "grad_norm": 0.005155966151505709, "kl": 0.078948974609375, "learning_rate": 3.7222222222222225e-06, "loss": 0.018, "num_tokens": 15944834.0, "reward": 0.5533797740936279, "reward_std": 0.24173156917095184, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.6190582513809204, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.2025451362133026, "step": 66 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5295155709342562, "calib/avg_num_step_conf": 6.4296875, "calib/ece": 0.14752941176470588, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.17254901960784313, "calib/gap": 0.012588235294117567, "calib/mean_conf": 0.7839215686274509, "calib/mu_c": 0.7881176470588235, "calib/mu_w": 0.7755294117647059, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13239215686274514, "calib/std_conf": 0.11805081045261738, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5469237832874196, "calib/step_q_c_n": 1089.0, "calib/step_q_gap": 0.0067981100378685655, "calib/step_q_w": 0.5401256732495511, "calib/step_q_w_n": 557.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1835.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 587.02734375, "completions/mean_terminated_length": 589.3294677734375, "completions/min_length": 0.0, "completions/min_terminated_length": 256.0, "epoch": 0.07146666666666666, "grad_norm": 0.005498452577739954, "kl": 0.09470367431640625, "learning_rate": 3.694444444444445e-06, "loss": -0.0151, "num_tokens": 16200121.0, "reward": 0.7042480707168579, "reward_std": 0.19104528427124023, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7488437294960022, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.3284023404121399, "step": 67 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5935358758888171, "calib/avg_num_step_conf": 7.1328125, "calib/ece": 0.2640562248995983, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.1686746987951807, "calib/gap": 0.03897091144149967, "calib/mean_conf": 0.786144578313253, "calib/mu_c": 0.8047692307692308, "calib/mu_w": 0.7657983193277311, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2640562248995983, "calib/std_conf": 0.11682374042131591, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5402944640753827, "calib/step_q_c_n": 849.0, "calib/step_q_gap": 0.07223919283689756, "calib/step_q_w": 0.46805527123848517, "calib/step_q_w_n": 977.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2500.0, "completions/max_terminated_length": 2500.0, "completions/mean_length": 593.8671875, "completions/mean_terminated_length": 598.5433349609375, "completions/min_length": 0.0, "completions/min_terminated_length": 222.0, "epoch": 0.07253333333333334, "grad_norm": 0.006504854653030634, "kl": 0.101654052734375, "learning_rate": 3.6666666666666666e-06, "loss": -0.0014, "num_tokens": 16456239.0, "reward": 0.5798185467720032, "reward_std": 0.25953006744384766, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.6659073829650879, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.19841712713241577, "step": 68 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5633442622950819, "calib/avg_num_step_conf": 6.3359375, "calib/ece": 0.2683400809716599, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.15384615384615385, "calib/gap": 0.03597704918032796, "calib/mean_conf": 0.7541700404858298, "calib/mu_c": 0.7723770491803279, "calib/mu_w": 0.7363999999999999, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2642914979757085, "calib/std_conf": 0.15699615584013946, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5594893617021277, "calib/step_q_c_n": 705.0, "calib/step_q_gap": 0.0736187691903138, "calib/step_q_w": 0.4858705925118139, "calib/step_q_w_n": 917.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2742.0, "completions/max_terminated_length": 2742.0, "completions/mean_length": 671.21875, "completions/mean_terminated_length": 679.1779174804688, "completions/min_length": 0.0, "completions/min_terminated_length": 257.0, "epoch": 0.0736, "grad_norm": 0.00525682931765914, "kl": 0.077056884765625, "learning_rate": 3.638888888888889e-06, "loss": 0.0244, "num_tokens": 16732567.0, "reward": 0.5367077589035034, "reward_std": 0.2361733615398407, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.6514406204223633, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.13447493314743042, "step": 69 }, { "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6777522935779816, "calib/avg_num_step_conf": 6.671875, "calib/ece": 0.2012244897959184, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.10612244897959183, "calib/gap": 0.1016358607663248, "calib/mean_conf": 0.756326530612245, "calib/mu_c": 0.8015441176470588, "calib/mu_w": 0.699908256880734, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2012244897959184, "calib/std_conf": 0.15497062101495002, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5415924276169265, "calib/step_q_c_n": 898.0, "calib/step_q_gap": 0.08201218070334626, "calib/step_q_w": 0.45958024691358024, "calib/step_q_w_n": 810.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2505.0, "completions/max_terminated_length": 2505.0, "completions/mean_length": 679.6015625, "completions/mean_terminated_length": 679.6015625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.07466666666666667, "grad_norm": 0.0051225475035607815, "kl": 0.079864501953125, "learning_rate": 3.6111111111111115e-06, "loss": 0.0327, "num_tokens": 17013537.0, "reward": 0.6205503940582275, "reward_std": 0.21652446687221527, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7026515603065491, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.2423553168773651, "step": 70 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5751824817518248, "calib/avg_num_step_conf": 6.4296875, "calib/ece": 0.2329761904761904, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.15873015873015872, "calib/gap": 0.04162424627102512, "calib/mean_conf": 0.7511507936507936, "calib/mu_c": 0.7701459854014598, "calib/mu_w": 0.7285217391304347, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.22023809523809518, "calib/std_conf": 0.14548449802654848, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5440951276102088, "calib/step_q_c_n": 862.0, "calib/step_q_gap": 0.07089359699796394, "calib/step_q_w": 0.4732015306122449, "calib/step_q_w_n": 784.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2704.0, "completions/max_terminated_length": 2704.0, "completions/mean_length": 623.2265625, "completions/mean_terminated_length": 623.2265625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.07573333333333333, "grad_norm": 0.0052217645570635796, "kl": 0.08162689208984375, "learning_rate": 3.5833333333333335e-06, "loss": 0.0507, "num_tokens": 17277491.0, "reward": 0.618299126625061, "reward_std": 0.23210731148719788, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6949383020401001, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.23931613564491272, "step": 71 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6526579111944966, "calib/avg_num_step_conf": 6.359375, "calib/ece": 0.22972332015810273, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.06719367588932806, "calib/gap": 0.062143214509068234, "calib/mean_conf": 0.7435573122529644, "calib/mu_c": 0.7737692307692308, "calib/mu_w": 0.7116260162601625, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22972332015810273, "calib/std_conf": 0.12023360692041232, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5231202046035807, "calib/step_q_c_n": 782.0, "calib/step_q_gap": 0.05049609112840331, "calib/step_q_w": 0.47262411347517735, "calib/step_q_w_n": 846.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1815.0, "completions/max_terminated_length": 1815.0, "completions/mean_length": 579.73828125, "completions/mean_terminated_length": 579.73828125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.0768, "grad_norm": 0.0058565582148730755, "kl": 0.08962249755859375, "learning_rate": 3.555555555555556e-06, "loss": 0.0249, "num_tokens": 17530312.0, "reward": 0.6693467497825623, "reward_std": 0.21299782395362854, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7056429386138916, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.333831787109375, "step": 72 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.603117666778411, "calib/avg_num_step_conf": 5.9453125, "calib/ece": 0.11738095238095247, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.0873015873015873, "calib/gap": 0.05125712370097213, "calib/mean_conf": 0.7324603174603174, "calib/mu_c": 0.7517834394904459, "calib/mu_w": 0.7005263157894738, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.11341269841269849, "calib/std_conf": 0.13967471568996953, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5286980920314254, "calib/step_q_c_n": 891.0, "calib/step_q_gap": 0.054499993774690114, "calib/step_q_w": 0.47419809825673526, "calib/step_q_w_n": 631.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2978.0, "completions/max_terminated_length": 2978.0, "completions/mean_length": 569.91796875, "completions/mean_terminated_length": 572.1529541015625, "completions/min_length": 0.0, "completions/min_terminated_length": 258.0, "epoch": 0.07786666666666667, "grad_norm": 0.005151054356247187, "kl": 0.07952117919921875, "learning_rate": 3.5277777777777784e-06, "loss": 0.0296, "num_tokens": 17783243.0, "reward": 0.6734863519668579, "reward_std": 0.24319618940353394, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7384711503982544, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.2905329763889313, "step": 73 }, { "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5709176788124156, "calib/avg_num_step_conf": 6.03515625, "calib/ece": 0.18209016393442626, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.09016393442622951, "calib/gap": 0.0366491228070176, "calib/mean_conf": 0.7148770491803279, "calib/mu_c": 0.732, "calib/mu_w": 0.6953508771929824, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.18209016393442626, "calib/std_conf": 0.14029676262981772, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.4818831168831168, "calib/step_q_c_n": 770.0, "calib/step_q_gap": 0.04229602010892325, "calib/step_q_w": 0.43958709677419355, "calib/step_q_w_n": 775.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2871.0, "completions/max_terminated_length": 2871.0, "completions/mean_length": 607.51953125, "completions/mean_terminated_length": 609.9019775390625, "completions/min_length": 0.0, "completions/min_terminated_length": 200.0, "epoch": 0.07893333333333333, "grad_norm": 0.005812863353639841, "kl": 0.07952880859375, "learning_rate": 3.5e-06, "loss": 0.0226, "num_tokens": 18042696.0, "reward": 0.6089438199996948, "reward_std": 0.2539913058280945, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.6791456937789917, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.24733565747737885, "step": 74 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6744325391230791, "calib/avg_num_step_conf": 6.2421875, "calib/ece": 0.06035294117647065, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.10588235294117647, "calib/gap": 0.08710700690821949, "calib/mean_conf": 0.7347058823529413, "calib/mu_c": 0.7627167630057804, "calib/mu_w": 0.6756097560975609, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.058313725490196144, "calib/std_conf": 0.13564013723136314, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4711592270531401, "calib/step_q_c_n": 1035.0, "calib/step_q_gap": 0.08709119863395715, "calib/step_q_w": 0.38406802841918297, "calib/step_q_w_n": 563.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2707.0, "completions/max_terminated_length": 2707.0, "completions/mean_length": 558.71484375, "completions/mean_terminated_length": 558.71484375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.08, "grad_norm": 0.005435822065919638, "kl": 0.0779571533203125, "learning_rate": 3.4722222222222224e-06, "loss": 0.0151, "num_tokens": 18290479.0, "reward": 0.7145378589630127, "reward_std": 0.2187555432319641, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7951613664627075, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.29953932762145996, "step": 75 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6992082662372516, "calib/avg_num_step_conf": 5.2734375, "calib/ece": 0.07870078740157484, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.08267716535433071, "calib/gap": 0.08666129898013963, "calib/mean_conf": 0.7011417322834644, "calib/mu_c": 0.7325308641975309, "calib/mu_w": 0.6458695652173913, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07102362204724412, "calib/std_conf": 0.13246852096184245, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5043577430972389, "calib/step_q_c_n": 833.0, "calib/step_q_gap": 0.06422234657886366, "calib/step_q_w": 0.44013539651837524, "calib/step_q_w_n": 517.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1250.0, "completions/max_terminated_length": 1250.0, "completions/mean_length": 552.93359375, "completions/mean_terminated_length": 555.1019897460938, "completions/min_length": 0.0, "completions/min_terminated_length": 243.0, "epoch": 0.08106666666666666, "grad_norm": 0.0056373546831309795, "kl": 0.0784912109375, "learning_rate": 3.444444444444445e-06, "loss": 0.0158, "num_tokens": 18535086.0, "reward": 0.7332932949066162, "reward_std": 0.19764763116836548, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7813144326210022, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.35949093103408813, "step": 76 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5792428975159899, "calib/avg_num_step_conf": 5.80078125, "calib/ece": 0.06327935222672058, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.04048582995951417, "calib/gap": 0.04321582626803522, "calib/mean_conf": 0.675587044534413, "calib/mu_c": 0.6897590361445783, "calib/mu_w": 0.6465432098765431, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.03340080971659914, "calib/std_conf": 0.13516656209384506, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.45939362795477906, "calib/step_q_c_n": 973.0, "calib/step_q_gap": 0.07753815920477908, "calib/step_q_w": 0.38185546875, "calib/step_q_w_n": 512.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2483.0, "completions/max_terminated_length": 2483.0, "completions/mean_length": 583.375, "completions/mean_terminated_length": 587.968505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 205.0, "epoch": 0.08213333333333334, "grad_norm": 0.005375280976295471, "kl": 0.07251739501953125, "learning_rate": 3.416666666666667e-06, "loss": 0.0405, "num_tokens": 18789094.0, "reward": 0.7070667743682861, "reward_std": 0.2357901632785797, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.752937912940979, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.3377581536769867, "step": 77 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5703867255452965, "calib/avg_num_step_conf": 5.32421875, "calib/ece": 0.05604743083003952, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.023715415019762844, "calib/gap": 0.04218988358089115, "calib/mean_conf": 0.66300395256917, "calib/mu_c": 0.6786792452830188, "calib/mu_w": 0.6364893617021277, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.04529644268774706, "calib/std_conf": 0.13493252589958443, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4545704994192799, "calib/step_q_c_n": 861.0, "calib/step_q_gap": 0.028793606989001075, "calib/step_q_w": 0.42577689243027883, "calib/step_q_w_n": 502.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2450.0, "completions/max_terminated_length": 2450.0, "completions/mean_length": 613.1875, "completions/mean_terminated_length": 613.1875, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.0832, "grad_norm": 0.005595955532044172, "kl": 0.071319580078125, "learning_rate": 3.3888888888888893e-06, "loss": 0.038, "num_tokens": 19054094.0, "reward": 0.6896684169769287, "reward_std": 0.17376267910003662, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7557179927825928, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.3025251030921936, "step": 78 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6021731123388582, "calib/avg_num_step_conf": 5.4296875, "calib/ece": 0.05882812499999987, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.01953125, "calib/gap": 0.053334806629834364, "calib/mean_conf": 0.672109375, "calib/mu_c": 0.6877348066298343, "calib/mu_w": 0.6344, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.011953124999999992, "calib/std_conf": 0.12951925739869488, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42610837438423643, "calib/step_q_c_n": 1015.0, "calib/step_q_gap": -0.026398292282430236, "calib/step_q_w": 0.45250666666666667, "calib/step_q_w_n": 375.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1292.0, "completions/max_terminated_length": 1292.0, "completions/mean_length": 589.1015625, "completions/mean_terminated_length": 591.4118041992188, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.08426666666666667, "grad_norm": 0.005253465846180916, "kl": 0.06707000732421875, "learning_rate": 3.3611111111111117e-06, "loss": -0.0086, "num_tokens": 19311280.0, "reward": 0.7282896041870117, "reward_std": 0.18334491550922394, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.7965078353881836, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.319446325302124, "step": 79 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6037854191263283, "calib/avg_num_step_conf": 5.68359375, "calib/ece": 0.03363636363636367, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.02766798418972332, "calib/gap": 0.038896103896103984, "calib/mean_conf": 0.6858893280632411, "calib/mu_c": 0.6977272727272728, "calib/mu_w": 0.6588311688311688, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.011936758893280646, "calib/std_conf": 0.1144936420760257, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.45525229826353425, "calib/step_q_c_n": 979.0, "calib/step_q_gap": 0.05777330666689562, "calib/step_q_w": 0.39747899159663863, "calib/step_q_w_n": 476.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1753.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 523.2421875, "completions/mean_terminated_length": 527.3621826171875, "completions/min_length": 0.0, "completions/min_terminated_length": 210.0, "epoch": 0.08533333333333333, "grad_norm": 0.0054220338352024555, "kl": 0.08350372314453125, "learning_rate": 3.3333333333333333e-06, "loss": -0.0257, "num_tokens": 19547390.0, "reward": 0.7184160351753235, "reward_std": 0.1986905336380005, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.7822699546813965, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.3194058835506439, "step": 80 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6122504785343177, "calib/avg_num_step_conf": 5.49609375, "calib/ece": 0.05792828685258968, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.05179282868525897, "calib/gap": 0.05364027891714529, "calib/mean_conf": 0.6771314741035855, "calib/mu_c": 0.6967924528301888, "calib/mu_w": 0.6431521739130435, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05079681274900403, "calib/std_conf": 0.1409322215857465, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.43097772277227725, "calib/step_q_c_n": 808.0, "calib/step_q_gap": 0.03289758921635072, "calib/step_q_w": 0.39808013355592653, "calib/step_q_w_n": 599.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2711.0, "completions/max_terminated_length": 2711.0, "completions/mean_length": 577.1875, "completions/mean_terminated_length": 584.0316162109375, "completions/min_length": 0.0, "completions/min_terminated_length": 215.0, "epoch": 0.0864, "grad_norm": 0.005068330559879541, "kl": 0.06775665283203125, "learning_rate": 3.3055555555555558e-06, "loss": 0.0248, "num_tokens": 19801398.0, "reward": 0.6715902090072632, "reward_std": 0.2226143777370453, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7558960914611816, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.2669718861579895, "step": 81 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5481099656357388, "calib/avg_num_step_conf": 4.96875, "calib/ece": 0.11867187499999998, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.05078125, "calib/gap": 0.043197821435518424, "calib/mean_conf": 0.6825, "calib/mu_c": 0.6988679245283019, "calib/mu_w": 0.6556701030927835, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.09003906249999999, "calib/std_conf": 0.14485984605818136, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4361780104712042, "calib/step_q_c_n": 764.0, "calib/step_q_gap": 0.03580399472317275, "calib/step_q_w": 0.40037401574803144, "calib/step_q_w_n": 508.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1453.0, "completions/max_terminated_length": 1453.0, "completions/mean_length": 491.01171875, "completions/mean_terminated_length": 492.9372863769531, "completions/min_length": 0.0, "completions/min_terminated_length": 210.0, "epoch": 0.08746666666666666, "grad_norm": 0.006127932574599981, "kl": 0.079132080078125, "learning_rate": 3.277777777777778e-06, "loss": 0.0023, "num_tokens": 20032649.0, "reward": 0.6537767648696899, "reward_std": 0.2364278882741928, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7447081804275513, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.24253278970718384, "step": 82 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6137151137151138, "calib/avg_num_step_conf": 5.140625, "calib/ece": 0.14070866141732277, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.05905511811023622, "calib/gap": 0.06377748377748393, "calib/mean_conf": 0.695275590551181, "calib/mu_c": 0.7231468531468531, "calib/mu_w": 0.6593693693693692, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.13649606299212594, "calib/std_conf": 0.1493704730630403, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.43745479833101525, "calib/step_q_c_n": 719.0, "calib/step_q_gap": 0.07338444657222126, "calib/step_q_w": 0.364070351758794, "calib/step_q_w_n": 597.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1725.0, "completions/max_terminated_length": 1725.0, "completions/mean_length": 590.90625, "completions/mean_terminated_length": 595.55908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 211.0, "epoch": 0.08853333333333334, "grad_norm": 0.00493778008967638, "kl": 0.06632232666015625, "learning_rate": 3.2500000000000002e-06, "loss": -0.0139, "num_tokens": 20291185.0, "reward": 0.6520576477050781, "reward_std": 0.19987721741199493, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7316687107086182, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.26463404297828674, "step": 83 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6889215686274509, "calib/avg_num_step_conf": 4.86328125, "calib/ece": 0.08845238095238102, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.047619047619047616, "calib/gap": 0.10650588235294134, "calib/mean_conf": 0.6836904761904763, "calib/mu_c": 0.7268000000000001, "calib/mu_w": 0.6202941176470588, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08845238095238102, "calib/std_conf": 0.15513777615989194, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4405128205128205, "calib/step_q_c_n": 741.0, "calib/step_q_gap": 0.026928164428164425, "calib/step_q_w": 0.41358465608465605, "calib/step_q_w_n": 504.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2470.0, "completions/max_terminated_length": 2470.0, "completions/mean_length": 516.78515625, "completions/mean_terminated_length": 518.811767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 193.0, "epoch": 0.0896, "grad_norm": 0.005957477726042271, "kl": 0.07054901123046875, "learning_rate": 3.2222222222222227e-06, "loss": 0.0353, "num_tokens": 20529402.0, "reward": 0.6486536264419556, "reward_std": 0.2027164101600647, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7663355469703674, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.2169092893600464, "step": 84 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.656754161331626, "calib/avg_num_step_conf": 5.0703125, "calib/ece": 0.12190476190476196, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.05952380952380952, "calib/gap": 0.08846862996158766, "calib/mean_conf": 0.6853968253968253, "calib/mu_c": 0.7240140845070423, "calib/mu_w": 0.6355454545454546, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12190476190476196, "calib/std_conf": 0.14734502643563752, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.41172610556348077, "calib/step_q_c_n": 701.0, "calib/step_q_gap": 0.04905776385493804, "calib/step_q_w": 0.36266834170854273, "calib/step_q_w_n": 597.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2457.0, "completions/max_terminated_length": 2457.0, "completions/mean_length": 551.07421875, "completions/mean_terminated_length": 553.2353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.09066666666666667, "grad_norm": 0.006076201796531677, "kl": 0.07193756103515625, "learning_rate": 3.1944444444444443e-06, "loss": 0.0252, "num_tokens": 20778301.0, "reward": 0.6651642918586731, "reward_std": 0.20453515648841858, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7460108995437622, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.2772863209247589, "step": 85 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5978315210598676, "calib/avg_num_step_conf": 4.57421875, "calib/ece": 0.18383399209486168, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.043478260869565216, "calib/gap": 0.05948381452318463, "calib/mean_conf": 0.6745849802371541, "calib/mu_c": 0.7044444444444447, "calib/mu_w": 0.64496062992126, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.18019762845849802, "calib/std_conf": 0.15813239840899346, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4352115059221659, "calib/step_q_c_n": 591.0, "calib/step_q_gap": 0.022832195577338288, "calib/step_q_w": 0.4123793103448276, "calib/step_q_w_n": 580.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2520.0, "completions/max_terminated_length": 2520.0, "completions/mean_length": 555.59765625, "completions/mean_terminated_length": 555.59765625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.09173333333333333, "grad_norm": 0.005948134697973728, "kl": 0.0674591064453125, "learning_rate": 3.1666666666666667e-06, "loss": 0.0117, "num_tokens": 21026046.0, "reward": 0.6285470724105835, "reward_std": 0.1985204517841339, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.7081304788589478, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.2552136182785034, "step": 86 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6512233622730861, "calib/avg_num_step_conf": 4.44921875, "calib/ece": 0.08999999999999994, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.035856573705179286, "calib/gap": 0.058494869771112734, "calib/mean_conf": 0.6917529880478087, "calib/mu_c": 0.7080662983425413, "calib/mu_w": 0.6495714285714286, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.030318725099601596, "calib/std_conf": 0.11360146815410785, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.46357231149567363, "calib/step_q_c_n": 809.0, "calib/step_q_gap": 0.04696625088961304, "calib/step_q_w": 0.4166060606060606, "calib/step_q_w_n": 330.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2547.0, "completions/max_terminated_length": 2547.0, "completions/mean_length": 486.36328125, "completions/mean_terminated_length": 488.2706298828125, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.0928, "grad_norm": 0.006253804080188274, "kl": 0.07823944091796875, "learning_rate": 3.138888888888889e-06, "loss": 0.0307, "num_tokens": 21256051.0, "reward": 0.7405951023101807, "reward_std": 0.18091410398483276, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.7928581833839417, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.35083192586898804, "step": 87 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6947640966628309, "calib/avg_num_step_conf": 4.66015625, "calib/ece": 0.08686274509803911, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.03137254901960784, "calib/gap": 0.0777927215189873, "calib/mean_conf": 0.7002745098039216, "calib/mu_c": 0.724375, "calib/mu_w": 0.6465822784810127, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.048470588235294064, "calib/std_conf": 0.11513471707413059, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4650488997555012, "calib/step_q_c_n": 818.0, "calib/step_q_gap": 0.021502233088834533, "calib/step_q_w": 0.44354666666666664, "calib/step_q_w_n": 375.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1178.0, "completions/max_terminated_length": 1178.0, "completions/mean_length": 500.49609375, "completions/mean_terminated_length": 502.4588623046875, "completions/min_length": 0.0, "completions/min_terminated_length": 194.0, "epoch": 0.09386666666666667, "grad_norm": 0.006577801890671253, "kl": 0.0890960693359375, "learning_rate": 3.1111111111111116e-06, "loss": 0.0028, "num_tokens": 21494026.0, "reward": 0.7101141810417175, "reward_std": 0.19201169908046722, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.802936315536499, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.2805732488632202, "step": 88 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7023994047988096, "calib/avg_num_step_conf": 4.546875, "calib/ece": 0.1671653543307087, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.03543307086614173, "calib/gap": 0.10110236220472457, "calib/mean_conf": 0.652992125984252, "calib/mu_c": 0.7035433070866142, "calib/mu_w": 0.6024409448818896, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16007874015748033, "calib/std_conf": 0.1607159912991228, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4362673611111111, "calib/step_q_c_n": 576.0, "calib/step_q_gap": 0.06159049036281178, "calib/step_q_w": 0.3746768707482993, "calib/step_q_w_n": 588.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1747.0, "completions/max_terminated_length": 1747.0, "completions/mean_length": 556.70703125, "completions/mean_terminated_length": 558.8902587890625, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.09493333333333333, "grad_norm": 0.005704918876290321, "kl": 0.07068634033203125, "learning_rate": 3.0833333333333336e-06, "loss": 0.0023, "num_tokens": 21745431.0, "reward": 0.6401622295379639, "reward_std": 0.18493416905403137, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.7454453110694885, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.2372228503227234, "step": 89 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6815751445086705, "calib/avg_num_step_conf": 5.15625, "calib/ece": 0.08272727272727273, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.05533596837944664, "calib/gap": 0.09759537572254329, "calib/mean_conf": 0.6687351778656127, "calib/mu_c": 0.6995953757225434, "calib/mu_w": 0.6020000000000001, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.033833992094861674, "calib/std_conf": 0.16450903427883745, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.41314831460674156, "calib/step_q_c_n": 890.0, "calib/step_q_gap": 0.0036599425137182617, "calib/step_q_w": 0.4094883720930233, "calib/step_q_w_n": 430.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2540.0, "completions/max_terminated_length": 2540.0, "completions/mean_length": 503.7109375, "completions/mean_terminated_length": 505.6863098144531, "completions/min_length": 0.0, "completions/min_terminated_length": 149.0, "epoch": 0.096, "grad_norm": 0.005787891335785389, "kl": 0.07630157470703125, "learning_rate": 3.055555555555556e-06, "loss": 0.0017, "num_tokens": 21977701.0, "reward": 0.7060627937316895, "reward_std": 0.18334735929965973, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7868348360061646, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.29325956106185913, "step": 90 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7528893780957622, "calib/avg_num_step_conf": 4.8046875, "calib/ece": 0.09371999999999998, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.036, "calib/gap": 0.10940423775454045, "calib/mean_conf": 0.6686, "calib/mu_c": 0.7088607594936709, "calib/mu_w": 0.5994565217391304, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.06516, "calib/std_conf": 0.13455125417475675, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.43705792682926836, "calib/step_q_c_n": 656.0, "calib/step_q_gap": 0.043469076655052286, "calib/step_q_w": 0.3935888501742161, "calib/step_q_w_n": 574.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1873.0, "completions/max_terminated_length": 1873.0, "completions/mean_length": 541.3828125, "completions/mean_terminated_length": 543.5059204101562, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.09706666666666666, "grad_norm": 0.005156281869858503, "kl": 0.0768280029296875, "learning_rate": 3.0277777777777776e-06, "loss": 0.0131, "num_tokens": 22224007.0, "reward": 0.661525309085846, "reward_std": 0.19307063519954681, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7801464796066284, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.2241540551185608, "step": 91 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6775775245163, "calib/avg_num_step_conf": 4.24609375, "calib/ece": 0.0763095238095238, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.06746031746031746, "calib/gap": 0.10284786641929489, "calib/mean_conf": 0.6794841269841269, "calib/mu_c": 0.7194805194805194, "calib/mu_w": 0.6166326530612245, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.07234126984126983, "calib/std_conf": 0.15808166636302534, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4584123711340206, "calib/step_q_c_n": 679.0, "calib/step_q_gap": 0.056303057408530366, "calib/step_q_w": 0.40210931372549025, "calib/step_q_w_n": 408.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2203.0, "completions/max_terminated_length": 2203.0, "completions/mean_length": 456.45703125, "completions/mean_terminated_length": 458.2471008300781, "completions/min_length": 0.0, "completions/min_terminated_length": 179.0, "epoch": 0.09813333333333334, "grad_norm": 0.006089536473155022, "kl": 0.0848388671875, "learning_rate": 3e-06, "loss": 0.0016, "num_tokens": 22447580.0, "reward": 0.6949906349182129, "reward_std": 0.2117110639810562, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7602003812789917, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.3141559362411499, "step": 92 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6703247336670518, "calib/avg_num_step_conf": 5.21875, "calib/ece": 0.12201581027667986, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.09090909090909091, "calib/gap": 0.09191503016300873, "calib/mean_conf": 0.6801976284584981, "calib/mu_c": 0.7187074829931972, "calib/mu_w": 0.6267924528301885, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11059288537549408, "calib/std_conf": 0.1633746164213354, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4623680649526387, "calib/step_q_c_n": 739.0, "calib/step_q_gap": 0.02143004150205241, "calib/step_q_w": 0.4409380234505863, "calib/step_q_w_n": 597.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2861.0, "completions/max_terminated_length": 2861.0, "completions/mean_length": 547.13671875, "completions/mean_terminated_length": 549.2824096679688, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.0992, "grad_norm": 0.006107079330831766, "kl": 0.07598114013671875, "learning_rate": 2.9722222222222225e-06, "loss": 0.0411, "num_tokens": 22693423.0, "reward": 0.6424725651741028, "reward_std": 0.2015823870897293, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7558277249336243, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.21661736071109772, "step": 93 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7461220597584233, "calib/avg_num_step_conf": 4.41015625, "calib/ece": 0.1146640316205533, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.07509881422924901, "calib/gap": 0.14400699300699316, "calib/mean_conf": 0.6776679841897233, "calib/mu_c": 0.7402797202797203, "calib/mu_w": 0.5962727272727272, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11355731225296437, "calib/std_conf": 0.17314715474512357, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.46685126582278486, "calib/step_q_c_n": 632.0, "calib/step_q_gap": 0.06349110485699011, "calib/step_q_w": 0.40336016096579475, "calib/step_q_w_n": 497.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2780.0, "completions/max_terminated_length": 2780.0, "completions/mean_length": 474.046875, "completions/mean_terminated_length": 475.9059143066406, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.10026666666666667, "grad_norm": 0.006255537271499634, "kl": 0.07927703857421875, "learning_rate": 2.944444444444445e-06, "loss": 0.0006, "num_tokens": 22923459.0, "reward": 0.6437188386917114, "reward_std": 0.19185274839401245, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7732378840446472, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.2048247754573822, "step": 94 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7481814291827129, "calib/avg_num_step_conf": 5.01953125, "calib/ece": 0.08197628458498027, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.08300395256916997, "calib/gap": 0.1519940094137785, "calib/mean_conf": 0.6468774703557312, "calib/mu_c": 0.6961403508771931, "calib/mu_w": 0.5441463414634146, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.026482213438735164, "calib/std_conf": 0.1890315781970204, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4154257641921397, "calib/step_q_c_n": 916.0, "calib/step_q_gap": -0.004818138246884651, "calib/step_q_w": 0.4202439024390244, "calib/step_q_w_n": 369.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2495.0, "completions/max_terminated_length": 2495.0, "completions/mean_length": 528.4921875, "completions/mean_terminated_length": 530.5647583007812, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.10133333333333333, "grad_norm": 0.005513823591172695, "kl": 0.07430267333984375, "learning_rate": 2.916666666666667e-06, "loss": -0.0091, "num_tokens": 23164881.0, "reward": 0.7214862108230591, "reward_std": 0.1887528896331787, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.801451563835144, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.31027084589004517, "step": 95 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7872743391360413, "calib/avg_num_step_conf": 4.7265625, "calib/ece": 0.06224409448818897, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.14173228346456693, "calib/gap": 0.20052546744036082, "calib/mean_conf": 0.7112992125984252, "calib/mu_c": 0.7634042553191488, "calib/mu_w": 0.562878787878788, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.016692913385826784, "calib/std_conf": 0.18306093021889827, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.45240015816528273, "calib/step_q_c_n": 843.0, "calib/step_q_gap": 0.11373530802904297, "calib/step_q_w": 0.33866485013623976, "calib/step_q_w_n": 367.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2078.0, "completions/max_terminated_length": 2078.0, "completions/mean_length": 483.61328125, "completions/mean_terminated_length": 483.61328125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.1024, "grad_norm": 0.005868226755410433, "kl": 0.08411407470703125, "learning_rate": 2.888888888888889e-06, "loss": -0.0118, "num_tokens": 23394502.0, "reward": 0.6806752681732178, "reward_std": 0.17523370683193207, "rewards/accuracy_reward_step": 0.734375, "rewards/final_brier_reward_step": 0.8438191413879395, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.1722189486026764, "step": 96 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6550604423868314, "calib/avg_num_step_conf": 4.85546875, "calib/ece": 0.13718253968253968, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.07936507936507936, "calib/gap": 0.07474537037037043, "calib/mean_conf": 0.6506746031746031, "calib/mu_c": 0.6827083333333334, "calib/mu_w": 0.6079629629629629, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.10821428571428574, "calib/std_conf": 0.18980551154311645, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4440243902439024, "calib/step_q_c_n": 656.0, "calib/step_q_gap": 0.07150309552499268, "calib/step_q_w": 0.3725212947189097, "calib/step_q_w_n": 587.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2937.0, "completions/max_terminated_length": 2937.0, "completions/mean_length": 488.90625, "completions/mean_terminated_length": 492.7558898925781, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.10346666666666667, "grad_norm": 0.0057335710152983665, "kl": 0.08429718017578125, "learning_rate": 2.861111111111111e-06, "loss": 0.0263, "num_tokens": 23624734.0, "reward": 0.6633927822113037, "reward_std": 0.23494769632816315, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7351964712142944, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.2829952538013458, "step": 97 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6662457912457912, "calib/avg_num_step_conf": 4.48046875, "calib/ece": 0.1360557768924303, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.14342629482071714, "calib/gap": 0.1102907277907279, "calib/mean_conf": 0.7010756972111555, "calib/mu_c": 0.7485314685314687, "calib/mu_w": 0.6382407407407408, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13370517928286857, "calib/std_conf": 0.18136720210395232, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.43551724137931036, "calib/step_q_c_n": 609.0, "calib/step_q_gap": 0.06094475067299071, "calib/step_q_w": 0.37457249070631965, "calib/step_q_w_n": 538.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2784.0, "completions/max_terminated_length": 2784.0, "completions/mean_length": 521.59765625, "completions/mean_terminated_length": 525.7047119140625, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.10453333333333334, "grad_norm": 0.0058539328165352345, "kl": 0.07634735107421875, "learning_rate": 2.8333333333333335e-06, "loss": 0.005, "num_tokens": 23864447.0, "reward": 0.6528790593147278, "reward_std": 0.2209436148405075, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7439660429954529, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.25397956371307373, "step": 98 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6324419285574858, "calib/avg_num_step_conf": 4.8125, "calib/ece": 0.22012, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.108, "calib/gap": 0.09897911380050772, "calib/mean_conf": 0.62124, "calib/mu_c": 0.6770642201834863, "calib/mu_w": 0.5780851063829786, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.20268, "calib/std_conf": 0.23681144904754922, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4099226305609284, "calib/step_q_c_n": 517.0, "calib/step_q_gap": 0.06525130188959977, "calib/step_q_w": 0.34467132867132866, "calib/step_q_w_n": 715.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2261.0, "completions/max_terminated_length": 2261.0, "completions/mean_length": 590.921875, "completions/mean_terminated_length": 597.9288940429688, "completions/min_length": 0.0, "completions/min_terminated_length": 108.0, "epoch": 0.1056, "grad_norm": 0.005409719422459602, "kl": 0.065093994140625, "learning_rate": 2.805555555555556e-06, "loss": -0.0199, "num_tokens": 24121523.0, "reward": 0.5566372871398926, "reward_std": 0.23340797424316406, "rewards/accuracy_reward_step": 0.42578125, "rewards/final_brier_reward_step": 0.695084810256958, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.13850226998329163, "step": 99 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7833354006450013, "calib/avg_num_step_conf": 5.05078125, "calib/ece": 0.1381176470588235, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.1568627450980392, "calib/gap": 0.2238284544777971, "calib/mean_conf": 0.6727843137254902, "calib/mu_c": 0.7746043165467627, "calib/mu_w": 0.5507758620689656, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1329019607843137, "calib/std_conf": 0.231705722231293, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.388548051948052, "calib/step_q_c_n": 770.0, "calib/step_q_gap": 0.051148434357229766, "calib/step_q_w": 0.3373996175908222, "calib/step_q_w_n": 523.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1983.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 561.5, "completions/mean_terminated_length": 561.5, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.10666666666666667, "grad_norm": 0.005751878954470158, "kl": 0.071319580078125, "learning_rate": 2.7777777777777783e-06, "loss": -0.0012, "num_tokens": 24372675.0, "reward": 0.6705541610717773, "reward_std": 0.19056624174118042, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.7899484038352966, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.2433473914861679, "step": 100 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.67578125, "calib/avg_num_step_conf": 5.41796875, "calib/ece": 0.159484126984127, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.1349206349206349, "calib/gap": 0.13800403225806446, "calib/mean_conf": 0.641468253968254, "calib/mu_c": 0.709375, "calib/mu_w": 0.5713709677419355, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14650793650793653, "calib/std_conf": 0.22893955626051365, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.38006134969325156, "calib/step_q_c_n": 652.0, "calib/step_q_gap": 0.06513617962522439, "calib/step_q_w": 0.31492517006802717, "calib/step_q_w_n": 735.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2913.0, "completions/max_terminated_length": 2913.0, "completions/mean_length": 568.0859375, "completions/mean_terminated_length": 568.0859375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.10773333333333333, "grad_norm": 0.005723032634705305, "kl": 0.0717315673828125, "learning_rate": 2.7500000000000004e-06, "loss": 0.0576, "num_tokens": 24625097.0, "reward": 0.6526573896408081, "reward_std": 0.21372900903224945, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.73505699634552, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.2741639316082001, "step": 101 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6968745870225981, "calib/avg_num_step_conf": 4.85546875, "calib/ece": 0.10972549019607844, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.17254901960784313, "calib/gap": 0.15897185146028792, "calib/mean_conf": 0.6816470588235294, "calib/mu_c": 0.7402484472049689, "calib/mu_w": 0.5812765957446809, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.08000000000000002, "calib/std_conf": 0.22704329927626454, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3889878542510122, "calib/step_q_c_n": 741.0, "calib/step_q_gap": 0.02321096182073329, "calib/step_q_w": 0.3657768924302789, "calib/step_q_w_n": 502.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1583.0, "completions/max_terminated_length": 1583.0, "completions/mean_length": 454.36328125, "completions/mean_terminated_length": 456.1451110839844, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.1088, "grad_norm": 0.00621482077986002, "kl": 0.08046722412109375, "learning_rate": 2.7222222222222224e-06, "loss": -0.0413, "num_tokens": 24848110.0, "reward": 0.6459211707115173, "reward_std": 0.18273219466209412, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7812562584877014, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.18636733293533325, "step": 102 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.743279569892473, "calib/avg_num_step_conf": 4.796875, "calib/ece": 0.05972332015810276, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.2015810276679842, "calib/gap": 0.2100994623655914, "calib/mean_conf": 0.6450197628458498, "calib/mu_c": 0.7222500000000001, "calib/mu_w": 0.5121505376344087, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.03616600790513833, "calib/std_conf": 0.25069488329564227, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3800127551020408, "calib/step_q_c_n": 784.0, "calib/step_q_gap": 0.016251493840779563, "calib/step_q_w": 0.36376126126126124, "calib/step_q_w_n": 444.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2345.0, "completions/max_terminated_length": 2345.0, "completions/mean_length": 573.76953125, "completions/mean_terminated_length": 573.76953125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.10986666666666667, "grad_norm": 0.00558763463050127, "kl": 0.07482147216796875, "learning_rate": 2.6944444444444444e-06, "loss": 0.0177, "num_tokens": 25099547.0, "reward": 0.6870755553245544, "reward_std": 0.16483908891677856, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7903547286987305, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.2619214355945587, "step": 103 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.699807369663829, "calib/avg_num_step_conf": 5.375, "calib/ece": 0.1439370078740157, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.1062992125984252, "calib/gap": 0.1560367861803268, "calib/mean_conf": 0.5893700787401575, "calib/mu_c": 0.6710743801652892, "calib/mu_w": 0.5150375939849624, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12846456692913383, "calib/std_conf": 0.24024838862847026, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42789137380191694, "calib/step_q_c_n": 626.0, "calib/step_q_gap": 0.06739804046858361, "calib/step_q_w": 0.36049333333333333, "calib/step_q_w_n": 750.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2718.0, "completions/max_terminated_length": 2718.0, "completions/mean_length": 541.4765625, "completions/mean_terminated_length": 543.6000366210938, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.11093333333333333, "grad_norm": 0.005712118931114674, "kl": 0.0785980224609375, "learning_rate": 2.666666666666667e-06, "loss": -0.0024, "num_tokens": 25344845.0, "reward": 0.6707723140716553, "reward_std": 0.19318893551826477, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.7519944906234741, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.2965813875198364, "step": 104 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6725705329153606, "calib/avg_num_step_conf": 5.33984375, "calib/ece": 0.13341176470588229, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.20784313725490197, "calib/gap": 0.170163009404389, "calib/mean_conf": 0.6169411764705883, "calib/mu_c": 0.6903448275862071, "calib/mu_w": 0.5201818181818181, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09086274509803917, "calib/std_conf": 0.28378267104058635, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.37041958041958045, "calib/step_q_c_n": 715.0, "calib/step_q_gap": 0.056830623364365784, "calib/step_q_w": 0.31358895705521467, "calib/step_q_w_n": 652.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2102.0, "completions/max_terminated_length": 2102.0, "completions/mean_length": 506.34765625, "completions/mean_terminated_length": 508.3333740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 203.0, "epoch": 0.112, "grad_norm": 0.0058001065626740456, "kl": 0.07666015625, "learning_rate": 2.6388888888888893e-06, "loss": 0.0332, "num_tokens": 25580230.0, "reward": 0.6575684547424316, "reward_std": 0.20606115460395813, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.752371072769165, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.25026577711105347, "step": 105 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7404392275653162, "calib/avg_num_step_conf": 4.6484375, "calib/ece": 0.0896442687747035, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.17391304347826086, "calib/gap": 0.22103054398586397, "calib/mean_conf": 0.6245059288537549, "calib/mu_c": 0.7241007194244605, "calib/mu_w": 0.5030701754385966, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08237154150197624, "calib/std_conf": 0.25921333745136416, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3931710914454277, "calib/step_q_c_n": 678.0, "calib/step_q_gap": 0.02887421644542776, "calib/step_q_w": 0.36429687499999996, "calib/step_q_w_n": 512.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2005.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 490.4765625, "completions/mean_terminated_length": 490.4765625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.11306666666666666, "grad_norm": 0.00585206039249897, "kl": 0.07993316650390625, "learning_rate": 2.6111111111111113e-06, "loss": 0.0038, "num_tokens": 25810376.0, "reward": 0.6838876008987427, "reward_std": 0.18188489973545074, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.779799222946167, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.2817259728908539, "step": 106 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.587069372345446, "calib/avg_num_step_conf": 5.0546875, "calib/ece": 0.15594488188976385, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.19291338582677164, "calib/gap": 0.08754870895975198, "calib/mean_conf": 0.646732283464567, "calib/mu_c": 0.6780981595092026, "calib/mu_w": 0.5905494505494506, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08047244094488193, "calib/std_conf": 0.253453219800162, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.36181850117096026, "calib/step_q_c_n": 854.0, "calib/step_q_gap": -0.009590589738130706, "calib/step_q_w": 0.37140909090909097, "calib/step_q_w_n": 440.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1641.0, "completions/max_terminated_length": 1641.0, "completions/mean_length": 475.3515625, "completions/mean_terminated_length": 479.094482421875, "completions/min_length": 0.0, "completions/min_terminated_length": 115.0, "epoch": 0.11413333333333334, "grad_norm": 0.005947344470769167, "kl": 0.08448028564453125, "learning_rate": 2.5833333333333337e-06, "loss": -0.0205, "num_tokens": 26036682.0, "reward": 0.7219668030738831, "reward_std": 0.21111388504505157, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7402527332305908, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.3778996169567108, "step": 107 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6728257275132274, "calib/avg_num_step_conf": 5.0390625, "calib/ece": 0.1403162055335968, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.2924901185770751, "calib/gap": 0.18115244708994716, "calib/mean_conf": 0.6612648221343873, "calib/mu_c": 0.7070899470899472, "calib/mu_w": 0.5259375000000001, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.027272727272727282, "calib/std_conf": 0.29326427300611696, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.37370160427807486, "calib/step_q_c_n": 935.0, "calib/step_q_gap": 0.06415230850342696, "calib/step_q_w": 0.3095492957746479, "calib/step_q_w_n": 355.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2140.0, "completions/max_terminated_length": 2140.0, "completions/mean_length": 525.47265625, "completions/mean_terminated_length": 531.7035522460938, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.1152, "grad_norm": 0.006274311803281307, "kl": 0.0753326416015625, "learning_rate": 2.5555555555555557e-06, "loss": -0.0022, "num_tokens": 26274435.0, "reward": 0.7267597913742065, "reward_std": 0.17975559830665588, "rewards/accuracy_reward_step": 0.73828125, "rewards/final_brier_reward_step": 0.7769195437431335, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.33128753304481506, "step": 108 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.8266497775580821, "calib/avg_num_step_conf": 5.18359375, "calib/ece": 0.1487058823529412, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.21568627450980393, "calib/gap": 0.3182773109243698, "calib/mean_conf": 0.6082352941176471, "calib/mu_c": 0.7779831932773109, "calib/mu_w": 0.45970588235294113, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14513725490196078, "calib/std_conf": 0.2847007777860333, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.39307692307692305, "calib/step_q_c_n": 559.0, "calib/step_q_gap": 0.07746494391025638, "calib/step_q_w": 0.31561197916666667, "calib/step_q_w_n": 768.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1808.0, "completions/max_terminated_length": 1808.0, "completions/mean_length": 520.66015625, "completions/mean_terminated_length": 522.7019653320312, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.11626666666666667, "grad_norm": 0.0058349259197711945, "kl": 0.08112335205078125, "learning_rate": 2.5277777777777778e-06, "loss": 0.0083, "num_tokens": 26512324.0, "reward": 0.6595449447631836, "reward_std": 0.1664503961801529, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.8052883148193359, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.22161419689655304, "step": 109 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6886036358897505, "calib/avg_num_step_conf": 4.5703125, "calib/ece": 0.13408730158730164, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.2619047619047619, "calib/gap": 0.18284681045155393, "calib/mean_conf": 0.6543253968253968, "calib/mu_c": 0.7290604026845637, "calib/mu_w": 0.5462135922330098, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.09857142857142867, "calib/std_conf": 0.284922610750544, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3879663608562691, "calib/step_q_c_n": 654.0, "calib/step_q_gap": 0.057772562406656736, "calib/step_q_w": 0.3301937984496124, "calib/step_q_w_n": 516.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2256.0, "completions/max_terminated_length": 2256.0, "completions/mean_length": 477.84375, "completions/mean_terminated_length": 481.6062927246094, "completions/min_length": 0.0, "completions/min_terminated_length": 105.0, "epoch": 0.11733333333333333, "grad_norm": 0.006158989388495684, "kl": 0.085357666015625, "learning_rate": 2.5e-06, "loss": 0.0093, "num_tokens": 26739572.0, "reward": 0.685361385345459, "reward_std": 0.21978026628494263, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7471511363983154, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.3110716640949249, "step": 110 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7192771845580337, "calib/avg_num_step_conf": 4.50390625, "calib/ece": 0.17658730158730157, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.30952380952380953, "calib/gap": 0.219637328615657, "calib/mean_conf": 0.6507936507936508, "calib/mu_c": 0.7545112781954888, "calib/mu_w": 0.5348739495798318, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14980158730158727, "calib/std_conf": 0.30543126723913944, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3840105078809107, "calib/step_q_c_n": 571.0, "calib/step_q_gap": 0.03663937386029209, "calib/step_q_w": 0.3473711340206186, "calib/step_q_w_n": 582.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2691.0, "completions/max_terminated_length": 2691.0, "completions/mean_length": 554.46875, "completions/mean_terminated_length": 554.46875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.1184, "grad_norm": 0.0054856035858392715, "kl": 0.07257080078125, "learning_rate": 2.4722222222222226e-06, "loss": 0.0214, "num_tokens": 26988924.0, "reward": 0.6293383836746216, "reward_std": 0.21292832493782043, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.740082859992981, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.21781271696090698, "step": 111 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7825503355704698, "calib/avg_num_step_conf": 4.78515625, "calib/ece": 0.12527559055118107, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.21653543307086615, "calib/gap": 0.3289434324065196, "calib/mean_conf": 0.5626771653543307, "calib/mu_c": 0.6986577181208053, "calib/mu_w": 0.3697142857142858, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.050669291338582625, "calib/std_conf": 0.3290596212853663, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.376, "calib/step_q_c_n": 695.0, "calib/step_q_gap": 0.07784905660377361, "calib/step_q_w": 0.2981509433962264, "calib/step_q_w_n": 530.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2745.0, "completions/max_terminated_length": 2745.0, "completions/mean_length": 551.30078125, "completions/mean_terminated_length": 551.30078125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.11946666666666667, "grad_norm": 0.006195261608809233, "kl": 0.0721588134765625, "learning_rate": 2.4444444444444447e-06, "loss": 0.0561, "num_tokens": 27237977.0, "reward": 0.6678087711334229, "reward_std": 0.19492530822753906, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.8018710613250732, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.2189026176929474, "step": 112 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7812828947368422, "calib/avg_num_step_conf": 5.14453125, "calib/ece": 0.08226190476190479, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.31746031746031744, "calib/gap": 0.30880263157894733, "calib/mean_conf": 0.6672619047619048, "calib/mu_c": 0.7898026315789474, "calib/mu_w": 0.48100000000000004, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0731746031746032, "calib/std_conf": 0.3011262636237346, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3532605729877217, "calib/step_q_c_n": 733.0, "calib/step_q_gap": 0.04346605243977647, "calib/step_q_w": 0.3097945205479452, "calib/step_q_w_n": 584.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2323.0, "completions/max_terminated_length": 2323.0, "completions/mean_length": 497.7265625, "completions/mean_terminated_length": 499.678466796875, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.12053333333333334, "grad_norm": 0.00658282358199358, "kl": 0.0800018310546875, "learning_rate": 2.4166666666666667e-06, "loss": -0.0095, "num_tokens": 27470595.0, "reward": 0.7177433371543884, "reward_std": 0.21778149902820587, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.8009738326072693, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.31810659170150757, "step": 113 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7824581005586593, "calib/avg_num_step_conf": 5.0390625, "calib/ece": 0.09307086614173234, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.3346456692913386, "calib/gap": 0.33646480446927385, "calib/mean_conf": 0.6803149606299213, "calib/mu_c": 0.7796648044692738, "calib/mu_w": 0.4431999999999999, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.034330708661417374, "calib/std_conf": 0.3222080244749067, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.40392541176470587, "calib/step_q_c_n": 850.0, "calib/step_q_gap": 0.12774359358288767, "calib/step_q_w": 0.2761818181818182, "calib/step_q_w_n": 440.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1777.0, "completions/max_terminated_length": 1777.0, "completions/mean_length": 485.6484375, "completions/mean_terminated_length": 487.552978515625, "completions/min_length": 0.0, "completions/min_terminated_length": 149.0, "epoch": 0.1216, "grad_norm": 0.8515591621398926, "kl": 4.70794677734375, "learning_rate": 2.388888888888889e-06, "loss": 0.0792, "num_tokens": 27699945.0, "reward": 0.7293667197227478, "reward_std": 0.1959434449672699, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.8210617303848267, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.2993904948234558, "step": 114 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6217948717948718, "calib/avg_num_step_conf": 4.625, "calib/ece": 0.1993307086614174, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.33858267716535434, "calib/gap": 0.1435635792778649, "calib/mean_conf": 0.6880708661417322, "calib/mu_c": 0.7434615384615384, "calib/mu_w": 0.5998979591836735, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1366141732283465, "calib/std_conf": 0.29743493117836134, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.38166666666666665, "calib/step_q_c_n": 732.0, "calib/step_q_gap": 0.015538348082595854, "calib/step_q_w": 0.3661283185840708, "calib/step_q_w_n": 452.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2213.0, "completions/max_terminated_length": 2213.0, "completions/mean_length": 444.4609375, "completions/mean_terminated_length": 444.4609375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.12266666666666666, "grad_norm": 0.006345377303659916, "kl": 0.08205413818359375, "learning_rate": 2.361111111111111e-06, "loss": -0.0121, "num_tokens": 27918991.0, "reward": 0.6729586720466614, "reward_std": 0.22253727912902832, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7313871383666992, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.29343652725219727, "step": 115 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6342528591177579, "calib/avg_num_step_conf": 4.91796875, "calib/ece": 0.1986666666666666, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.2980392156862745, "calib/gap": 0.16562272213145668, "calib/mean_conf": 0.6034509803921568, "calib/mu_c": 0.6742465753424658, "calib/mu_w": 0.5086238532110091, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.11478431372549017, "calib/std_conf": 0.3466295029321082, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.39010043041606884, "calib/step_q_c_n": 697.0, "calib/step_q_gap": 0.0578584375335065, "calib/step_q_w": 0.33224199288256234, "calib/step_q_w_n": 562.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1177.0, "completions/max_terminated_length": 1177.0, "completions/mean_length": 508.07421875, "completions/mean_terminated_length": 510.06671142578125, "completions/min_length": 0.0, "completions/min_terminated_length": 104.0, "epoch": 0.12373333333333333, "grad_norm": 0.005793208722025156, "kl": 0.0783233642578125, "learning_rate": 2.3333333333333336e-06, "loss": 0.0129, "num_tokens": 28153578.0, "reward": 0.6464129686355591, "reward_std": 0.22290663421154022, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7034027576446533, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.2784857451915741, "step": 116 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7134639303482587, "calib/avg_num_step_conf": 5.171875, "calib/ece": 0.16669291338582673, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.2992125984251969, "calib/gap": 0.22972388059701487, "calib/mean_conf": 0.6566929133858267, "calib/mu_c": 0.7652238805970148, "calib/mu_w": 0.5355, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1479133858267716, "calib/std_conf": 0.3043142675168991, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.41185459940652813, "calib/step_q_c_n": 674.0, "calib/step_q_gap": 0.058885368637297375, "calib/step_q_w": 0.35296923076923076, "calib/step_q_w_n": 650.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2356.0, "completions/max_terminated_length": 2356.0, "completions/mean_length": 490.5, "completions/mean_terminated_length": 490.5, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.1248, "grad_norm": 0.005411152727901936, "kl": 0.0897064208984375, "learning_rate": 2.305555555555556e-06, "loss": 0.0368, "num_tokens": 28385746.0, "reward": 0.6755538582801819, "reward_std": 0.21486549079418182, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7500835657119751, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.29789912700653076, "step": 117 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7546408137317229, "calib/avg_num_step_conf": 5.94140625, "calib/ece": 0.1397233201581028, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.3359683794466403, "calib/gap": 0.3244475524475524, "calib/mean_conf": 0.6468379446640315, "calib/mu_c": 0.7879020979020979, "calib/mu_w": 0.46345454545454545, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1106719367588933, "calib/std_conf": 0.3476224745399264, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.35203726708074534, "calib/step_q_c_n": 805.0, "calib/step_q_gap": 0.045550255907560955, "calib/step_q_w": 0.3064870111731844, "calib/step_q_w_n": 716.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2318.0, "completions/max_terminated_length": 2318.0, "completions/mean_length": 551.515625, "completions/mean_terminated_length": 555.8582763671875, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.12586666666666665, "grad_norm": 0.005002293735742569, "kl": 0.08039093017578125, "learning_rate": 2.277777777777778e-06, "loss": 0.0024, "num_tokens": 28630942.0, "reward": 0.6808527708053589, "reward_std": 0.1966305822134018, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7770004272460938, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.27533018589019775, "step": 118 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7247474747474748, "calib/avg_num_step_conf": 4.94921875, "calib/ece": 0.1542913385826772, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.3031496062992126, "calib/gap": 0.2739583333333333, "calib/mean_conf": 0.5953149606299212, "calib/mu_c": 0.7139583333333334, "calib/mu_w": 0.44000000000000006, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09133858267716542, "calib/std_conf": 0.3456850143501426, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.41453235294117646, "calib/step_q_c_n": 680.0, "calib/step_q_gap": 0.046230251862244054, "calib/step_q_w": 0.3683021010789324, "calib/step_q_w_n": 587.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2349.0, "completions/max_terminated_length": 2349.0, "completions/mean_length": 551.5078125, "completions/mean_terminated_length": 553.6705932617188, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.12693333333333334, "grad_norm": 0.005192252341657877, "kl": 0.08580780029296875, "learning_rate": 2.25e-06, "loss": 0.0301, "num_tokens": 28877192.0, "reward": 0.6779056191444397, "reward_std": 0.20860622823238373, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7626948952674866, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.2821788191795349, "step": 119 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7398603723404256, "calib/avg_num_step_conf": 5.15234375, "calib/ece": 0.1735826771653544, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.2559055118110236, "calib/gap": 0.30309840425531914, "calib/mean_conf": 0.5672047244094488, "calib/mu_c": 0.679375, "calib/mu_w": 0.3762765957446808, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.05543307086614178, "calib/std_conf": 0.35620879567946134, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.41292326431181486, "calib/step_q_c_n": 821.0, "calib/step_q_gap": 0.047943344633100016, "calib/step_q_w": 0.36497991967871485, "calib/step_q_w_n": 498.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2306.0, "completions/max_terminated_length": 2306.0, "completions/mean_length": 495.671875, "completions/mean_terminated_length": 495.671875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.128, "grad_norm": 0.005819946061819792, "kl": 0.0976715087890625, "learning_rate": 2.222222222222222e-06, "loss": 0.0498, "num_tokens": 29110772.0, "reward": 0.709846019744873, "reward_std": 0.21084654331207275, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7673988342285156, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.32963696122169495, "step": 120 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6623596891575182, "calib/avg_num_step_conf": 5.34375, "calib/ece": 0.1974117647058823, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2823529411764706, "calib/gap": 0.19373072653262613, "calib/mean_conf": 0.6141176470588235, "calib/mu_c": 0.706044776119403, "calib/mu_w": 0.5123140495867768, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1430196078431372, "calib/std_conf": 0.33535568750143735, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4042550847457627, "calib/step_q_c_n": 708.0, "calib/step_q_gap": 0.007709630200308126, "calib/step_q_w": 0.3965454545454546, "calib/step_q_w_n": 660.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2152.0, "completions/max_terminated_length": 2152.0, "completions/mean_length": 549.84375, "completions/mean_terminated_length": 552.0000610351562, "completions/min_length": 0.0, "completions/min_terminated_length": 198.0, "epoch": 0.12906666666666666, "grad_norm": 0.004886144772171974, "kl": 0.09521484375, "learning_rate": 2.1944444444444445e-06, "loss": -0.0044, "num_tokens": 29356588.0, "reward": 0.6371598243713379, "reward_std": 0.26278766989707947, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7241054773330688, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.24630790948867798, "step": 121 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7403474903474905, "calib/avg_num_step_conf": 4.78515625, "calib/ece": 0.13984063745019926, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.2868525896414343, "calib/gap": 0.31731209781209774, "calib/mean_conf": 0.6520318725099602, "calib/mu_c": 0.7923571428571429, "calib/mu_w": 0.47504504504504513, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1170517928286853, "calib/std_conf": 0.33847833703431246, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.43339848484848487, "calib/step_q_c_n": 660.0, "calib/step_q_gap": 0.08638963529096272, "calib/step_q_w": 0.34700884955752215, "calib/step_q_w_n": 565.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2206.0, "completions/max_terminated_length": 2206.0, "completions/mean_length": 516.546875, "completions/mean_terminated_length": 518.5725708007812, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.13013333333333332, "grad_norm": 0.004893404431641102, "kl": 0.10149383544921875, "learning_rate": 2.166666666666667e-06, "loss": -0.0062, "num_tokens": 29596168.0, "reward": 0.6806086301803589, "reward_std": 0.2333202213048935, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7710624933242798, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.2846860885620117, "step": 122 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5823994935106047, "calib/avg_num_step_conf": 5.0703125, "calib/ece": 0.2592857142857143, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2698412698412698, "calib/gap": 0.10882051282051297, "calib/mean_conf": 0.6021428571428572, "calib/mu_c": 0.6526666666666667, "calib/mu_w": 0.5438461538461538, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16285714285714287, "calib/std_conf": 0.34300864708165757, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.44494976377952755, "calib/step_q_c_n": 635.0, "calib/step_q_gap": 0.07440345910381113, "calib/step_q_w": 0.3705463046757164, "calib/step_q_w_n": 663.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2783.0, "completions/max_terminated_length": 2783.0, "completions/mean_length": 577.77734375, "completions/mean_terminated_length": 580.0431518554688, "completions/min_length": 0.0, "completions/min_terminated_length": 186.0, "epoch": 0.1312, "grad_norm": 0.004810995887964964, "kl": 0.0954742431640625, "learning_rate": 2.138888888888889e-06, "loss": 0.0226, "num_tokens": 29849367.0, "reward": 0.5776532888412476, "reward_std": 0.2678280472755432, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6648507714271545, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.1896745264530182, "step": 123 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6507565789473684, "calib/avg_num_step_conf": 4.99609375, "calib/ece": 0.19662745098039225, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2784313725490196, "calib/gap": 0.1785657894736843, "calib/mean_conf": 0.6257254901960785, "calib/mu_c": 0.69225, "calib/mu_w": 0.5136842105263157, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09745098039215695, "calib/std_conf": 0.33533384468237853, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.44340931677018636, "calib/step_q_c_n": 805.0, "calib/step_q_gap": 0.010309316770186372, "calib/step_q_w": 0.4331, "calib/step_q_w_n": 474.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1759.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 510.94140625, "completions/mean_terminated_length": 512.9451293945312, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.13226666666666667, "grad_norm": 0.004809759557247162, "kl": 0.1109619140625, "learning_rate": 2.1111111111111114e-06, "loss": -0.0343, "num_tokens": 30086984.0, "reward": 0.6774014234542847, "reward_std": 0.2310647964477539, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7343937158584595, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.29619038105010986, "step": 124 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5847081999507511, "calib/avg_num_step_conf": 5.09765625, "calib/ece": 0.2771372549019608, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.2784313725490196, "calib/gap": 0.10810268406796364, "calib/mean_conf": 0.6024705882352941, "calib/mu_c": 0.6550381679389313, "calib/mu_w": 0.5469354838709677, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18294117647058825, "calib/std_conf": 0.3478254212169548, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42561712439418414, "calib/step_q_c_n": 619.0, "calib/step_q_gap": 0.05229015646415497, "calib/step_q_w": 0.3733269679300292, "calib/step_q_w_n": 686.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1760.0, "completions/max_terminated_length": 1760.0, "completions/mean_length": 538.71484375, "completions/mean_terminated_length": 540.8274536132812, "completions/min_length": 0.0, "completions/min_terminated_length": 181.0, "epoch": 0.13333333333333333, "grad_norm": 0.00505201518535614, "kl": 0.10394287109375, "learning_rate": 2.0833333333333334e-06, "loss": -0.0273, "num_tokens": 30329703.0, "reward": 0.5742524862289429, "reward_std": 0.24623748660087585, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.6717531681060791, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.17597052454948425, "step": 125 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6400856638951877, "calib/avg_num_step_conf": 5.47265625, "calib/ece": 0.2600210317460318, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.31746031746031744, "calib/gap": 0.17258174603174614, "calib/mean_conf": 0.6055345238095238, "calib/mu_c": 0.6918253968253968, "calib/mu_w": 0.5192436507936506, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18277777777777784, "calib/std_conf": 0.3660238688636956, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4356067880794702, "calib/step_q_c_n": 604.0, "calib/step_q_gap": 0.09701318707570605, "calib/step_q_w": 0.3385936010037641, "calib/step_q_w_n": 797.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2477.0, "completions/max_terminated_length": 2477.0, "completions/mean_length": 528.65625, "completions/mean_terminated_length": 532.8189086914062, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.1344, "grad_norm": 0.004750107880681753, "kl": 0.0997772216796875, "learning_rate": 2.0555555555555555e-06, "loss": -0.0065, "num_tokens": 30570503.0, "reward": 0.5669196248054504, "reward_std": 0.23805102705955505, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.6774504780769348, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.16185757517814636, "step": 126 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6682522681451614, "calib/avg_num_step_conf": 5.37890625, "calib/ece": 0.21242063492063484, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.2777777777777778, "calib/gap": 0.22329637096774185, "calib/mean_conf": 0.6054365079365079, "calib/mu_c": 0.7153125, "calib/mu_w": 0.49201612903225816, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1549603174603174, "calib/std_conf": 0.35511373085502307, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.41818784194528874, "calib/step_q_c_n": 658.0, "calib/step_q_gap": 0.04134959437922481, "calib/step_q_w": 0.37683824756606393, "calib/step_q_w_n": 719.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2279.0, "completions/max_terminated_length": 2279.0, "completions/mean_length": 498.65625, "completions/mean_terminated_length": 502.5826721191406, "completions/min_length": 0.0, "completions/min_terminated_length": 159.0, "epoch": 0.13546666666666668, "grad_norm": 0.006005220580846071, "kl": 0.115814208984375, "learning_rate": 2.027777777777778e-06, "loss": 0.0036, "num_tokens": 30801831.0, "reward": 0.5964082479476929, "reward_std": 0.22729472815990448, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.7108261585235596, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.1858965903520584, "step": 127 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.636987619475271, "calib/avg_num_step_conf": 4.7265625, "calib/ece": 0.21941480000000002, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.252, "calib/gap": 0.18101319520174475, "calib/mean_conf": 0.5682651999999999, "calib/mu_c": 0.6544274809160305, "calib/mu_w": 0.4734142857142858, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13184, "calib/std_conf": 0.36506893216070857, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.43656195462478187, "calib/step_q_c_n": 573.0, "calib/step_q_gap": 0.11015112259966414, "calib/step_q_w": 0.3264108320251177, "calib/step_q_w_n": 637.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2157.0, "completions/max_terminated_length": 2157.0, "completions/mean_length": 511.12890625, "completions/mean_terminated_length": 519.2420654296875, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.13653333333333334, "grad_norm": 0.005148904863744974, "kl": 0.119293212890625, "learning_rate": 2.0000000000000003e-06, "loss": -0.0198, "num_tokens": 31039344.0, "reward": 0.571398138999939, "reward_std": 0.23523159325122833, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.689100980758667, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.15603910386562347, "step": 128 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5945349952061362, "calib/avg_num_step_conf": 4.96875, "calib/ece": 0.27204724409448816, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.3031496062992126, "calib/gap": 0.0967350591243209, "calib/mean_conf": 0.6659842519685041, "calib/mu_c": 0.7059731543624161, "calib/mu_w": 0.6092380952380952, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17570866141732283, "calib/std_conf": 0.32659001687448785, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.43590402075226975, "calib/step_q_c_n": 771.0, "calib/step_q_gap": -0.014559651902420867, "calib/step_q_w": 0.4504636726546906, "calib/step_q_w_n": 501.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1996.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 461.68359375, "completions/mean_terminated_length": 461.68359375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.1376, "grad_norm": 0.005383655428886414, "kl": 0.1225128173828125, "learning_rate": 1.9722222222222224e-06, "loss": 0.0319, "num_tokens": 31259919.0, "reward": 0.6519710421562195, "reward_std": 0.24818632006645203, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6860554814338684, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.30304285883903503, "step": 129 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7220584381551363, "calib/avg_num_step_conf": 4.8671875, "calib/ece": 0.13090196078431376, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.3058823529411765, "calib/gap": 0.26196737421383637, "calib/mean_conf": 0.6494901960784314, "calib/mu_c": 0.7481132075471697, "calib/mu_w": 0.48614583333333333, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07843137254901965, "calib/std_conf": 0.33223446555100444, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.46137522768670314, "calib/step_q_c_n": 732.0, "calib/step_q_gap": 0.050129702394874354, "calib/step_q_w": 0.4112455252918288, "calib/step_q_w_n": 514.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2370.0, "completions/max_terminated_length": 2370.0, "completions/mean_length": 479.08984375, "completions/mean_terminated_length": 479.08984375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.13866666666666666, "grad_norm": 0.005056967493146658, "kl": 0.123077392578125, "learning_rate": 1.944444444444445e-06, "loss": 0.002, "num_tokens": 31487854.0, "reward": 0.6769087314605713, "reward_std": 0.18351736664772034, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7702523469924927, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.2609088718891144, "step": 130 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6623878364905285, "calib/avg_num_step_conf": 4.953125, "calib/ece": 0.22767716535433072, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.1968503937007874, "calib/gap": 0.19071660019940173, "calib/mean_conf": 0.5394094488188976, "calib/mu_c": 0.6415254237288135, "calib/mu_w": 0.45080882352941176, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15125984251968505, "calib/std_conf": 0.34153204745423066, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4148675721561969, "calib/step_q_c_n": 589.0, "calib/step_q_gap": 0.016053117878910728, "calib/step_q_w": 0.3988144542772862, "calib/step_q_w_n": 678.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2269.0, "completions/max_terminated_length": 2269.0, "completions/mean_length": 481.52734375, "completions/mean_terminated_length": 481.52734375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.13973333333333332, "grad_norm": 0.005481308791786432, "kl": 0.1237945556640625, "learning_rate": 1.916666666666667e-06, "loss": 0.0127, "num_tokens": 31717333.0, "reward": 0.6147300601005554, "reward_std": 0.19411855936050415, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.7143527269363403, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.22526362538337708, "step": 131 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7591631423811354, "calib/avg_num_step_conf": 4.703125, "calib/ece": 0.14913725490196073, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.28627450980392155, "calib/gap": 0.32385620915032676, "calib/mean_conf": 0.6092156862745098, "calib/mu_c": 0.7387581699346405, "calib/mu_w": 0.41490196078431374, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.07917647058823526, "calib/std_conf": 0.3581326467876354, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.447086889818689, "calib/step_q_c_n": 717.0, "calib/step_q_gap": 0.11053658181047543, "calib/step_q_w": 0.33655030800821356, "calib/step_q_w_n": 487.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1378.0, "completions/max_terminated_length": 1378.0, "completions/mean_length": 499.546875, "completions/mean_terminated_length": 501.50592041015625, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.1408, "grad_norm": 0.005307801067829132, "kl": 0.12115478515625, "learning_rate": 1.888888888888889e-06, "loss": 0.0038, "num_tokens": 31950809.0, "reward": 0.676338791847229, "reward_std": 0.23184099793434143, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.783726155757904, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.25098270177841187, "step": 132 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6145109395109395, "calib/avg_num_step_conf": 4.859375, "calib/ece": 0.28905138339920944, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.2648221343873518, "calib/gap": 0.15012612612612608, "calib/mean_conf": 0.5928458498023715, "calib/mu_c": 0.6806666666666666, "calib/mu_w": 0.5305405405405406, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23343873517786556, "calib/std_conf": 0.3546771119033089, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42118825910931174, "calib/step_q_c_n": 494.0, "calib/step_q_gap": 0.05738959244264508, "calib/step_q_w": 0.36379866666666666, "calib/step_q_w_n": 750.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1809.0, "completions/max_terminated_length": 1809.0, "completions/mean_length": 552.0078125, "completions/mean_terminated_length": 556.3543090820312, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.14186666666666667, "grad_norm": 0.0047843498177826405, "kl": 0.1084136962890625, "learning_rate": 1.8611111111111113e-06, "loss": -0.0054, "num_tokens": 32198467.0, "reward": 0.5920966267585754, "reward_std": 0.2622288465499878, "rewards/accuracy_reward_step": 0.41015625, "rewards/final_brier_reward_step": 0.6648152470588684, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.23969054222106934, "step": 133 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.774765478424015, "calib/avg_num_step_conf": 4.70703125, "calib/ece": 0.12474308300395254, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.25691699604743085, "calib/gap": 0.3447091932457787, "calib/mean_conf": 0.5839525691699605, "calib/mu_c": 0.7515384615384616, "calib/mu_w": 0.4068292682926829, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09743083003952566, "calib/std_conf": 0.35480578348636826, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.410953947368421, "calib/step_q_c_n": 608.0, "calib/step_q_gap": 0.057947582209292026, "calib/step_q_w": 0.353006365159129, "calib/step_q_w_n": 597.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1944.0, "completions/max_terminated_length": 1944.0, "completions/mean_length": 544.9453125, "completions/mean_terminated_length": 549.2362060546875, "completions/min_length": 0.0, "completions/min_terminated_length": 179.0, "epoch": 0.14293333333333333, "grad_norm": 0.004672987386584282, "kl": 0.1106719970703125, "learning_rate": 1.8333333333333333e-06, "loss": 0.0134, "num_tokens": 32446925.0, "reward": 0.6539692282676697, "reward_std": 0.23669315874576569, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7823336124420166, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.22638607025146484, "step": 134 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6206156716417911, "calib/avg_num_step_conf": 4.6328125, "calib/ece": 0.24350393700787404, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.2952755905511811, "calib/gap": 0.14422388059701496, "calib/mean_conf": 0.6179133858267716, "calib/mu_c": 0.6940000000000001, "calib/mu_w": 0.5497761194029851, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19448818897637796, "calib/std_conf": 0.33028974451628174, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42056243781094527, "calib/step_q_c_n": 536.0, "calib/step_q_gap": 0.04271284806735548, "calib/step_q_w": 0.3778495897435898, "calib/step_q_w_n": 650.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2836.0, "completions/max_terminated_length": 2836.0, "completions/mean_length": 548.71484375, "completions/mean_terminated_length": 548.71484375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.144, "grad_norm": 0.004846002906560898, "kl": 0.110809326171875, "learning_rate": 1.8055555555555557e-06, "loss": 0.0157, "num_tokens": 32693276.0, "reward": 0.6306657195091248, "reward_std": 0.2516302466392517, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.686989426612854, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.2829357087612152, "step": 135 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7482587064676616, "calib/avg_num_step_conf": 5.4609375, "calib/ece": 0.16629921259842523, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.2637795275590551, "calib/gap": 0.3096194029850746, "calib/mean_conf": 0.5501574803149607, "calib/mu_c": 0.7135, "calib/mu_w": 0.4038805970149254, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1220078740157481, "calib/std_conf": 0.3632345595874525, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.39179575163398694, "calib/step_q_c_n": 612.0, "calib/step_q_gap": 0.036409746544928456, "calib/step_q_w": 0.3553860050890585, "calib/step_q_w_n": 786.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2233.0, "completions/max_terminated_length": 2233.0, "completions/mean_length": 495.49609375, "completions/mean_terminated_length": 495.49609375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.14506666666666668, "grad_norm": 0.005031277425587177, "kl": 0.1255035400390625, "learning_rate": 1.777777777777778e-06, "loss": -0.007, "num_tokens": 32928611.0, "reward": 0.658591628074646, "reward_std": 0.2328774631023407, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.7611265778541565, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.2638690769672394, "step": 136 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6497611865258924, "calib/avg_num_step_conf": 5.21484375, "calib/ece": 0.22667984189723317, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.30039525691699603, "calib/gap": 0.17296694318753136, "calib/mean_conf": 0.6133201581027667, "calib/mu_c": 0.6933088235294117, "calib/mu_w": 0.5203418803418803, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15122529644268773, "calib/std_conf": 0.3469324128412001, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.385613630406291, "calib/step_q_c_n": 763.0, "calib/step_q_gap": 0.010008560476221118, "calib/step_q_w": 0.37560506993006987, "calib/step_q_w_n": 572.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2087.0, "completions/max_terminated_length": 2087.0, "completions/mean_length": 489.1171875, "completions/mean_terminated_length": 491.0353088378906, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.14613333333333334, "grad_norm": 0.004626833368092775, "kl": 0.1271209716796875, "learning_rate": 1.75e-06, "loss": 0.0013, "num_tokens": 33160809.0, "reward": 0.6661921739578247, "reward_std": 0.2168254852294922, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7029668092727661, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.32551121711730957, "step": 137 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7155593093093093, "calib/avg_num_step_conf": 5.0703125, "calib/ece": 0.15832031250000006, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.2578125, "calib/gap": 0.23970220220220223, "calib/mean_conf": 0.5904296874999999, "calib/mu_c": 0.691554054054054, "calib/mu_w": 0.4518518518518518, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08531250000000007, "calib/std_conf": 0.34287988019662563, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4437465181058496, "calib/step_q_c_n": 718.0, "calib/step_q_gap": 0.07487927672653927, "calib/step_q_w": 0.3688672413793103, "calib/step_q_w_n": 580.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1278.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 502.1015625, "completions/mean_terminated_length": 504.07061767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.1472, "grad_norm": 0.005026695318520069, "kl": 0.1251373291015625, "learning_rate": 1.7222222222222224e-06, "loss": 0.0011, "num_tokens": 33393683.0, "reward": 0.6768547296524048, "reward_std": 0.22881154716014862, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7553105354309082, "rewards/format_reward_step": 1.0, "rewards/step_margin_reward": 0.2827739715576172, "step": 138 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7254982600442897, "calib/avg_num_step_conf": 4.76171875, "calib/ece": 0.16476377952755902, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.2795275590551181, "calib/gap": 0.27408984498576405, "calib/mean_conf": 0.6055511811023622, "calib/mu_c": 0.7231724137931036, "calib/mu_w": 0.44908256880733954, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09972440944881886, "calib/std_conf": 0.35066569333605613, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4365808823529412, "calib/step_q_c_n": 680.0, "calib/step_q_gap": 0.027864926879842877, "calib/step_q_w": 0.4087159554730983, "calib/step_q_w_n": 539.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2811.0, "completions/max_terminated_length": 2811.0, "completions/mean_length": 458.19921875, "completions/mean_terminated_length": 459.99609375, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.14826666666666666, "grad_norm": 0.005129787139594555, "kl": 0.13323974609375, "learning_rate": 1.6944444444444446e-06, "loss": 0.0286, "num_tokens": 33614078.0, "reward": 0.6852666139602661, "reward_std": 0.2394963949918747, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7552605271339417, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.30433520674705505, "step": 139 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.8103160842891437, "calib/avg_num_step_conf": 4.5703125, "calib/ece": 0.10023529411764698, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.3137254901960784, "calib/gap": 0.3649046412376634, "calib/mean_conf": 0.6422745098039215, "calib/mu_c": 0.7739263803680982, "calib/mu_w": 0.4090217391304348, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.051647058823529324, "calib/std_conf": 0.3423327173237316, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4144586070959264, "calib/step_q_c_n": 761.0, "calib/step_q_gap": -0.002594204640014919, "calib/step_q_w": 0.41705281173594133, "calib/step_q_w_n": 409.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2809.0, "completions/max_terminated_length": 2809.0, "completions/mean_length": 488.71484375, "completions/mean_terminated_length": 488.71484375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.14933333333333335, "grad_norm": 0.005066412966698408, "kl": 0.129180908203125, "learning_rate": 1.6666666666666667e-06, "loss": 0.0392, "num_tokens": 33844205.0, "reward": 0.7103152275085449, "reward_std": 0.2360602468252182, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.8172827959060669, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.27678507566452026, "step": 140 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7928881123406459, "calib/avg_num_step_conf": 4.64453125, "calib/ece": 0.10885826771653549, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.2874015748031496, "calib/gap": 0.36308613214262603, "calib/mean_conf": 0.6105905511811023, "calib/mu_c": 0.754967320261438, "calib/mu_w": 0.39188118811881195, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05854330708661423, "calib/std_conf": 0.3468622648499856, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42326777456647396, "calib/step_q_c_n": 692.0, "calib/step_q_gap": 0.048438800723415554, "calib/step_q_w": 0.3748289738430584, "calib/step_q_w_n": 497.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2038.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 529.62109375, "completions/mean_terminated_length": 529.62109375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.1504, "grad_norm": 0.004820066969841719, "kl": 0.1171722412109375, "learning_rate": 1.638888888888889e-06, "loss": 0.0115, "num_tokens": 34086884.0, "reward": 0.6914817094802856, "reward_std": 0.22751720249652863, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.8076714873313904, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.25732317566871643, "step": 141 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7499068554396424, "calib/avg_num_step_conf": 4.9609375, "calib/ece": 0.14440944881889772, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.2440944881889764, "calib/gap": 0.3235059612518629, "calib/mean_conf": 0.5598425196850393, "calib/mu_c": 0.7152272727272727, "calib/mu_w": 0.39172131147540984, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09228346456692924, "calib/std_conf": 0.3597078522627466, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4225084745762712, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.05316870001910534, "calib/step_q_w": 0.3693397745571659, "calib/step_q_w_n": 621.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1377.0, "completions/max_terminated_length": 1377.0, "completions/mean_length": 509.4375, "completions/mean_terminated_length": 511.4353332519531, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.15146666666666667, "grad_norm": 0.005076853092759848, "kl": 0.125946044921875, "learning_rate": 1.6111111111111113e-06, "loss": -0.0051, "num_tokens": 34322460.0, "reward": 0.6526362895965576, "reward_std": 0.21421653032302856, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7747867107391357, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.22892338037490845, "step": 142 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7989877769289535, "calib/avg_num_step_conf": 5.3203125, "calib/ece": 0.11095617529880475, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.2589641434262948, "calib/gap": 0.3715457091927679, "calib/mean_conf": 0.5635458167330677, "calib/mu_c": 0.7396969696969696, "calib/mu_w": 0.3681512605042017, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07430278884462146, "calib/std_conf": 0.35467028750808266, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.414310294117647, "calib/step_q_c_n": 680.0, "calib/step_q_gap": 0.0897075570697487, "calib/step_q_w": 0.3246027370478983, "calib/step_q_w_n": 682.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2484.0, "completions/max_terminated_length": 2484.0, "completions/mean_length": 520.80078125, "completions/mean_terminated_length": 524.9015502929688, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.15253333333333333, "grad_norm": 0.00469217961654067, "kl": 0.1230316162109375, "learning_rate": 1.5833333333333333e-06, "loss": -0.0088, "num_tokens": 34563121.0, "reward": 0.7117146253585815, "reward_std": 0.20838120579719543, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7929409742355347, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.33126944303512573, "step": 143 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6699095911949686, "calib/avg_num_step_conf": 4.92578125, "calib/ece": 0.20898039215686273, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.22745098039215686, "calib/gap": 0.22264937106918237, "calib/mean_conf": 0.5367450980392157, "calib/mu_c": 0.6205660377358491, "calib/mu_w": 0.3979166666666667, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.061098039215686274, "calib/std_conf": 0.35905430487395606, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.38659533742331287, "calib/step_q_c_n": 815.0, "calib/step_q_gap": -0.031072824011664724, "calib/step_q_w": 0.4176681614349776, "calib/step_q_w_n": 446.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2631.0, "completions/max_terminated_length": 2631.0, "completions/mean_length": 509.140625, "completions/mean_terminated_length": 509.140625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.1536, "grad_norm": 0.004818373825401068, "kl": 0.13165283203125, "learning_rate": 1.5555555555555558e-06, "loss": 0.0273, "num_tokens": 34797589.0, "reward": 0.672430157661438, "reward_std": 0.2170725166797638, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7267078161239624, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.29549625515937805, "step": 144 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6520072019205122, "calib/avg_num_step_conf": 5.19140625, "calib/ece": 0.19156862745098044, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2980392156862745, "calib/gap": 0.1672225926913844, "calib/mean_conf": 0.622, "calib/mu_c": 0.6823312883435583, "calib/mu_w": 0.5151086956521739, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0871764705882353, "calib/std_conf": 0.3363044838867792, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4111351351351351, "calib/step_q_c_n": 851.0, "calib/step_q_gap": 0.03382760375438204, "calib/step_q_w": 0.37730753138075307, "calib/step_q_w_n": 478.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1092.0, "completions/max_terminated_length": 1092.0, "completions/mean_length": 471.890625, "completions/mean_terminated_length": 473.7412109375, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.15466666666666667, "grad_norm": 0.004987073130905628, "kl": 0.125579833984375, "learning_rate": 1.527777777777778e-06, "loss": 0.0031, "num_tokens": 35021097.0, "reward": 0.644719123840332, "reward_std": 0.2383957952260971, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7302496433258057, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.23262612521648407, "step": 145 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6666347381864624, "calib/avg_num_step_conf": 5.0390625, "calib/ece": 0.21276679841897234, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.2608695652173913, "calib/gap": 0.23161430395913146, "calib/mean_conf": 0.5546640316205534, "calib/mu_c": 0.6874074074074074, "calib/mu_w": 0.4557931034482759, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17027667984189726, "calib/std_conf": 0.3571411322026893, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3992572463768116, "calib/step_q_c_n": 552.0, "calib/step_q_gap": 0.027237056674914606, "calib/step_q_w": 0.372020189701897, "calib/step_q_w_n": 738.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2317.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 523.33984375, "completions/mean_terminated_length": 523.33984375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.15573333333333333, "grad_norm": 0.005135540850460529, "kl": 0.1239166259765625, "learning_rate": 1.5e-06, "loss": -0.0058, "num_tokens": 35262288.0, "reward": 0.620847225189209, "reward_std": 0.20095868408679962, "rewards/accuracy_reward_step": 0.421875, "rewards/final_brier_reward_step": 0.7163043022155762, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.2433588057756424, "step": 146 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6535714285714286, "calib/avg_num_step_conf": 5.19921875, "calib/ece": 0.22660079051383403, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.2924901185770751, "calib/gap": 0.1993408521303257, "calib/mean_conf": 0.6015415019762845, "calib/mu_c": 0.7063333333333333, "calib/mu_w": 0.5069924812030076, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17691699604743083, "calib/std_conf": 0.35387018239051427, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.43867449664429536, "calib/step_q_c_n": 596.0, "calib/step_q_gap": 0.08102551705245858, "calib/step_q_w": 0.3576489795918368, "calib/step_q_w_n": 735.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1207.0, "completions/max_terminated_length": 1207.0, "completions/mean_length": 514.41015625, "completions/mean_terminated_length": 516.427490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.1568, "grad_norm": 0.005366252735257149, "kl": 0.1230010986328125, "learning_rate": 1.4722222222222225e-06, "loss": -0.0171, "num_tokens": 35497657.0, "reward": 0.6017385721206665, "reward_std": 0.2290113866329193, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.7003504037857056, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.2101580798625946, "step": 147 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7547921071176885, "calib/avg_num_step_conf": 5.0546875, "calib/ece": 0.14605577689243032, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.2948207171314741, "calib/gap": 0.32399859055673014, "calib/mean_conf": 0.6243824701195219, "calib/mu_c": 0.7353939393939394, "calib/mu_w": 0.41139534883720924, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05653386454183271, "calib/std_conf": 0.34655492474931315, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4162940024479804, "calib/step_q_c_n": 817.0, "calib/step_q_gap": 0.0164826816932635, "calib/step_q_w": 0.3998113207547169, "calib/step_q_w_n": 477.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1845.0, "completions/max_terminated_length": 1845.0, "completions/mean_length": 483.390625, "completions/mean_terminated_length": 489.12255859375, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.15786666666666666, "grad_norm": 0.005183129571378231, "kl": 0.1303863525390625, "learning_rate": 1.4444444444444445e-06, "loss": -0.0061, "num_tokens": 35726517.0, "reward": 0.7119890451431274, "reward_std": 0.21605724096298218, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7839125394821167, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.3150656521320343, "step": 148 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7620216836734693, "calib/avg_num_step_conf": 5.140625, "calib/ece": 0.12071428571428566, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.30952380952380953, "calib/gap": 0.34632142857142867, "calib/mean_conf": 0.5707936507936507, "calib/mu_c": 0.7247142857142858, "calib/mu_w": 0.3783928571428571, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.06797619047619043, "calib/std_conf": 0.3578653714550651, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4296190839694657, "calib/step_q_c_n": 655.0, "calib/step_q_gap": 0.10079003707082723, "calib/step_q_w": 0.32882904689863846, "calib/step_q_w_n": 661.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2555.0, "completions/max_terminated_length": 2555.0, "completions/mean_length": 543.54296875, "completions/mean_terminated_length": 545.674560546875, "completions/min_length": 0.0, "completions/min_terminated_length": 97.0, "epoch": 0.15893333333333334, "grad_norm": 0.0045463708229362965, "kl": 0.1185455322265625, "learning_rate": 1.4166666666666667e-06, "loss": 0.0153, "num_tokens": 35970120.0, "reward": 0.6921254396438599, "reward_std": 0.22323009371757507, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7795078158378601, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.29927438497543335, "step": 149 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7414935064935065, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.17824000000000004, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.372, "calib/gap": 0.29605844155844135, "calib/mean_conf": 0.64152, "calib/mu_c": 0.7717857142857142, "calib/mu_w": 0.47572727272727283, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12988000000000002, "calib/std_conf": 0.3545544945420943, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4243561643835616, "calib/step_q_c_n": 730.0, "calib/step_q_gap": 0.0568515666824122, "calib/step_q_w": 0.3675045977011494, "calib/step_q_w_n": 609.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2667.0, "completions/max_terminated_length": 2667.0, "completions/mean_length": 484.84765625, "completions/mean_terminated_length": 484.84765625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.16, "grad_norm": 0.004933979362249374, "kl": 0.1362152099609375, "learning_rate": 1.3888888888888892e-06, "loss": 0.0212, "num_tokens": 36199201.0, "reward": 0.6484754085540771, "reward_std": 0.2582949995994568, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7452633380889893, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.2470000684261322, "step": 150 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7442561205273069, "calib/avg_num_step_conf": 4.78125, "calib/ece": 0.15478260869565222, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.25296442687747034, "calib/gap": 0.3273986189579412, "calib/mean_conf": 0.5369960474308301, "calib/mu_c": 0.7116949152542374, "calib/mu_w": 0.38429629629629625, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11268774703557316, "calib/std_conf": 0.3602179675892091, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.40124333925399647, "calib/step_q_c_n": 563.0, "calib/step_q_gap": 0.04791353592570602, "calib/step_q_w": 0.35332980332829045, "calib/step_q_w_n": 661.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2731.0, "completions/max_terminated_length": 2731.0, "completions/mean_length": 530.87890625, "completions/mean_terminated_length": 532.9608154296875, "completions/min_length": 0.0, "completions/min_terminated_length": 196.0, "epoch": 0.16106666666666666, "grad_norm": 0.005048654042184353, "kl": 0.119140625, "learning_rate": 1.3611111111111112e-06, "loss": -0.0143, "num_tokens": 36442130.0, "reward": 0.6133000254631042, "reward_std": 0.21529051661491394, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.770215630531311, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.1665406972169876, "step": 151 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6082205029013539, "calib/avg_num_step_conf": 5.12109375, "calib/ece": 0.22011952191235057, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.17131474103585656, "calib/gap": 0.12902256608639584, "calib/mean_conf": 0.5024302788844621, "calib/mu_c": 0.5749090909090909, "calib/mu_w": 0.4458865248226951, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1421513944223107, "calib/std_conf": 0.3419373997941873, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4019067415730337, "calib/step_q_c_n": 534.0, "calib/step_q_gap": 0.04298010064639274, "calib/step_q_w": 0.35892664092664095, "calib/step_q_w_n": 777.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2653.0, "completions/max_terminated_length": 2653.0, "completions/mean_length": 512.16015625, "completions/mean_terminated_length": 514.1686401367188, "completions/min_length": 0.0, "completions/min_terminated_length": 177.0, "epoch": 0.16213333333333332, "grad_norm": 0.005053962115198374, "kl": 0.125335693359375, "learning_rate": 1.3333333333333334e-06, "loss": 0.005, "num_tokens": 36678635.0, "reward": 0.5785558223724365, "reward_std": 0.2497299313545227, "rewards/accuracy_reward_step": 0.4296875, "rewards/final_brier_reward_step": 0.6750144362449646, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.20162850618362427, "step": 152 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5855084636257915, "calib/avg_num_step_conf": 5.1484375, "calib/ece": 0.2675697211155379, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2350597609561753, "calib/gap": 0.10179286729551623, "calib/mean_conf": 0.5389641434262948, "calib/mu_c": 0.583169014084507, "calib/mu_w": 0.4813761467889908, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12039840637450201, "calib/std_conf": 0.3531132493149983, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3921118045476536, "calib/step_q_c_n": 689.0, "calib/step_q_gap": 0.06117746432507809, "calib/step_q_w": 0.3309343402225755, "calib/step_q_w_n": 629.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2462.0, "completions/max_terminated_length": 2462.0, "completions/mean_length": 557.41015625, "completions/mean_terminated_length": 557.41015625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.1632, "grad_norm": 0.004702843725681305, "kl": 0.1187286376953125, "learning_rate": 1.3055555555555556e-06, "loss": 0.0577, "num_tokens": 36928652.0, "reward": 0.5983327627182007, "reward_std": 0.22896221280097961, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6617656350135803, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.2286498248577118, "step": 153 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6769323671497585, "calib/avg_num_step_conf": 4.734375, "calib/ece": 0.23519841269841268, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.24603174603174602, "calib/gap": 0.23345156369183856, "calib/mean_conf": 0.5424206349206349, "calib/mu_c": 0.6702631578947371, "calib/mu_w": 0.43681159420289856, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1626190476190476, "calib/std_conf": 0.3671210572621941, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.41268858800773695, "calib/step_q_c_n": 517.0, "calib/step_q_gap": 0.04542887577752114, "calib/step_q_w": 0.3672597122302158, "calib/step_q_w_n": 695.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2787.0, "completions/max_terminated_length": 2787.0, "completions/mean_length": 513.32421875, "completions/mean_terminated_length": 513.32421875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.16426666666666667, "grad_norm": 42.83332824707031, "kl": 424.1183166503906, "learning_rate": 1.2777777777777779e-06, "loss": 4.2855, "num_tokens": 37164503.0, "reward": 0.5859673023223877, "reward_std": 0.23850896954536438, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.7137206792831421, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.17227637767791748, "step": 154 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6353689100800782, "calib/avg_num_step_conf": 5.35546875, "calib/ece": 0.261171875, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.2578125, "calib/gap": 0.17725900116144022, "calib/mean_conf": 0.529453125, "calib/mu_c": 0.6215447154471545, "calib/mu_w": 0.4442857142857143, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.155078125, "calib/std_conf": 0.3705766380625395, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3797499267935578, "calib/step_q_c_n": 683.0, "calib/step_q_gap": -0.014480063516519703, "calib/step_q_w": 0.3942299903100775, "calib/step_q_w_n": 688.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1181.0, "completions/max_terminated_length": 1181.0, "completions/mean_length": 469.2890625, "completions/mean_terminated_length": 471.1294250488281, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.16533333333333333, "grad_norm": 0.005444099195301533, "kl": 0.132110595703125, "learning_rate": 1.25e-06, "loss": 0.0071, "num_tokens": 37391857.0, "reward": 0.5964875817298889, "reward_std": 0.23414337635040283, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.6991492509841919, "rewards/format_reward_step": 1.0, "rewards/step_margin_reward": 0.19773223996162415, "step": 155 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7470858134920634, "calib/avg_num_step_conf": 5.390625, "calib/ece": 0.14956692913385827, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.23228346456692914, "calib/gap": 0.3048710317460318, "calib/mean_conf": 0.5268897637795276, "calib/mu_c": 0.6781250000000001, "calib/mu_w": 0.3732539682539683, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08625984251968505, "calib/std_conf": 0.3619340160063211, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.40273932253313693, "calib/step_q_c_n": 679.0, "calib/step_q_gap": 0.05212591311801562, "calib/step_q_w": 0.3506134094151213, "calib/step_q_w_n": 701.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2337.0, "completions/max_terminated_length": 2337.0, "completions/mean_length": 524.4609375, "completions/mean_terminated_length": 526.5177001953125, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.1664, "grad_norm": 0.004729734733700752, "kl": 0.13568115234375, "learning_rate": 1.2222222222222223e-06, "loss": 0.0215, "num_tokens": 37630879.0, "reward": 0.6934912204742432, "reward_std": 0.21397851407527924, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.7648956775665283, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.32364919781684875, "step": 156 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7214698963697628, "calib/avg_num_step_conf": 5.546875, "calib/ece": 0.16271653543307094, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.25984251968503935, "calib/gap": 0.2603566660308984, "calib/mean_conf": 0.6001181102362205, "calib/mu_c": 0.709795918367347, "calib/mu_w": 0.4494392523364486, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09204724409448822, "calib/std_conf": 0.35050299321788464, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4152710872162485, "calib/step_q_c_n": 837.0, "calib/step_q_gap": 0.04565204776513354, "calib/step_q_w": 0.36961903945111496, "calib/step_q_w_n": 583.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2512.0, "completions/max_terminated_length": 2512.0, "completions/mean_length": 512.2890625, "completions/mean_terminated_length": 514.298095703125, "completions/min_length": 0.0, "completions/min_terminated_length": 149.0, "epoch": 0.16746666666666668, "grad_norm": 0.004471070598810911, "kl": 0.134033203125, "learning_rate": 1.1944444444444446e-06, "loss": 0.0115, "num_tokens": 37865753.0, "reward": 0.6392801403999329, "reward_std": 0.2263888418674469, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7539042830467224, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.21137472987174988, "step": 157 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6468207718207718, "calib/avg_num_step_conf": 5.31640625, "calib/ece": 0.2062745098039216, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.30980392156862746, "calib/gap": 0.17891414141414141, "calib/mean_conf": 0.5980392156862745, "calib/mu_c": 0.6675, "calib/mu_w": 0.48858585858585857, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09627450980392163, "calib/std_conf": 0.34570201158667146, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.41464428457234215, "calib/step_q_c_n": 834.0, "calib/step_q_gap": 0.034258452820286556, "calib/step_q_w": 0.3803858317520556, "calib/step_q_w_n": 527.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1748.0, "completions/max_terminated_length": 1748.0, "completions/mean_length": 492.14453125, "completions/mean_terminated_length": 494.07452392578125, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.16853333333333334, "grad_norm": 0.004758656956255436, "kl": 0.15380859375, "learning_rate": 1.1666666666666668e-06, "loss": -0.0176, "num_tokens": 38096982.0, "reward": 0.6552407741546631, "reward_std": 0.21743571758270264, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.721720278263092, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.26844868063926697, "step": 158 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6775748351090817, "calib/avg_num_step_conf": 5.625, "calib/ece": 0.20212598425196854, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.2677165354330709, "calib/gap": 0.21219558599695593, "calib/mean_conf": 0.5766929133858267, "calib/mu_c": 0.6669178082191781, "calib/mu_w": 0.4547222222222222, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10200787401574807, "calib/std_conf": 0.3529862595039625, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4086933842239186, "calib/step_q_c_n": 786.0, "calib/step_q_gap": 0.04906249737376567, "calib/step_q_w": 0.35963088685015293, "calib/step_q_w_n": 654.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2944.0, "completions/max_terminated_length": 2944.0, "completions/mean_length": 503.87890625, "completions/mean_terminated_length": 503.87890625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.1696, "grad_norm": 0.004795048851519823, "kl": 0.15203857421875, "learning_rate": 1.138888888888889e-06, "loss": 0.0394, "num_tokens": 38330759.0, "reward": 0.6931055784225464, "reward_std": 0.20757821202278137, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.728975772857666, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.34473535418510437, "step": 159 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7036967418546366, "calib/avg_num_step_conf": 5.484375, "calib/ece": 0.20494071146245058, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.308300395256917, "calib/gap": 0.2683734335839598, "calib/mean_conf": 0.5654150197628458, "calib/mu_c": 0.6927067669172932, "calib/mu_w": 0.42433333333333334, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12233201581027668, "calib/std_conf": 0.3729959966666498, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3919554131054131, "calib/step_q_c_n": 702.0, "calib/step_q_gap": 0.0031860398860398265, "calib/step_q_w": 0.38876937321937327, "calib/step_q_w_n": 702.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2459.0, "completions/max_terminated_length": 2459.0, "completions/mean_length": 527.1875, "completions/mean_terminated_length": 529.2549438476562, "completions/min_length": 0.0, "completions/min_terminated_length": 200.0, "epoch": 0.17066666666666666, "grad_norm": 0.004639564547687769, "kl": 0.1502838134765625, "learning_rate": 1.111111111111111e-06, "loss": 0.0288, "num_tokens": 38570559.0, "reward": 0.6068136692047119, "reward_std": 0.2425975650548935, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7312051057815552, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.1808597892522812, "step": 160 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6673586829836828, "calib/avg_num_step_conf": 5.53125, "calib/ece": 0.20228346456692903, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.30708661417322836, "calib/gap": 0.20857080419580404, "calib/mean_conf": 0.6125984251968505, "calib/mu_c": 0.6766477272727273, "calib/mu_w": 0.4680769230769232, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06098425196850384, "calib/std_conf": 0.35160198641064827, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3988762641898865, "calib/step_q_c_n": 969.0, "calib/step_q_gap": -0.03246601768930818, "calib/step_q_w": 0.43134228187919466, "calib/step_q_w_n": 447.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2512.0, "completions/max_terminated_length": 2512.0, "completions/mean_length": 488.07421875, "completions/mean_terminated_length": 488.07421875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.17173333333333332, "grad_norm": 0.004807854071259499, "kl": 0.146820068359375, "learning_rate": 1.0833333333333335e-06, "loss": 0.0451, "num_tokens": 38799426.0, "reward": 0.678764283657074, "reward_std": 0.22198915481567383, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.7400749921798706, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.28151601552963257, "step": 161 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6792058516196448, "calib/avg_num_step_conf": 5.265625, "calib/ece": 0.18742063492063482, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.31746031746031744, "calib/gap": 0.22275653082549635, "calib/mean_conf": 0.6286111111111111, "calib/mu_c": 0.7055151515151515, "calib/mu_w": 0.4827586206896552, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08063492063492056, "calib/std_conf": 0.35325590958159636, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42484174311926604, "calib/step_q_c_n": 872.0, "calib/step_q_gap": 0.03172451622850969, "calib/step_q_w": 0.39311722689075634, "calib/step_q_w_n": 476.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2361.0, "completions/max_terminated_length": 2361.0, "completions/mean_length": 483.18359375, "completions/mean_terminated_length": 483.18359375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.1728, "grad_norm": 0.005033192690461874, "kl": 0.164642333984375, "learning_rate": 1.0555555555555557e-06, "loss": 0.0649, "num_tokens": 39027265.0, "reward": 0.6548143625259399, "reward_std": 0.2401808500289917, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7367371320724487, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.24711044132709503, "step": 162 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6913576327152655, "calib/avg_num_step_conf": 5.87890625, "calib/ece": 0.2013147410358566, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.24701195219123506, "calib/gap": 0.2577959105918212, "calib/mean_conf": 0.5337450199203189, "calib/mu_c": 0.6611023622047245, "calib/mu_w": 0.4033064516129033, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11454183266932269, "calib/std_conf": 0.367372926678158, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4159095759233926, "calib/step_q_c_n": 731.0, "calib/step_q_gap": 0.035744201246390106, "calib/step_q_w": 0.3801653746770025, "calib/step_q_w_n": 774.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2913.0, "completions/max_terminated_length": 2913.0, "completions/mean_length": 557.71484375, "completions/mean_terminated_length": 557.71484375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.17386666666666667, "grad_norm": 0.004345105495303869, "kl": 0.148162841796875, "learning_rate": 1.0277777777777777e-06, "loss": 0.0341, "num_tokens": 39274872.0, "reward": 0.605945885181427, "reward_std": 0.23104888200759888, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.7246820330619812, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.19267842173576355, "step": 163 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7164532206169958, "calib/avg_num_step_conf": 5.55078125, "calib/ece": 0.18797619047619046, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.23015873015873015, "calib/gap": 0.2715998990599963, "calib/mean_conf": 0.5062301587301588, "calib/mu_c": 0.6366412213740459, "calib/mu_w": 0.36504132231404957, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08718253968253968, "calib/std_conf": 0.3661203649486512, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3894722955145119, "calib/step_q_c_n": 758.0, "calib/step_q_gap": 0.06453815775684474, "calib/step_q_w": 0.32493413775766716, "calib/step_q_w_n": 663.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1733.0, "completions/max_terminated_length": 1733.0, "completions/mean_length": 563.26953125, "completions/mean_terminated_length": 565.4784545898438, "completions/min_length": 0.0, "completions/min_terminated_length": 199.0, "epoch": 0.17493333333333333, "grad_norm": 0.004327528644353151, "kl": 0.1538543701171875, "learning_rate": 1.0000000000000002e-06, "loss": -0.014, "num_tokens": 39525205.0, "reward": 0.6413910388946533, "reward_std": 0.22969061136245728, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.740004301071167, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.2435591071844101, "step": 164 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6714582823586982, "calib/avg_num_step_conf": 5.6640625, "calib/ece": 0.21378906250000007, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.23828125, "calib/gap": 0.2234732077318327, "calib/mean_conf": 0.5459765625, "calib/mu_c": 0.6629508196721312, "calib/mu_w": 0.4394776119402985, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.14160156250000003, "calib/std_conf": 0.36246204466907095, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.41703323485967503, "calib/step_q_c_n": 677.0, "calib/step_q_gap": 0.018956477636734137, "calib/step_q_w": 0.3980767572229409, "calib/step_q_w_n": 773.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1359.0, "completions/max_terminated_length": 1359.0, "completions/mean_length": 540.61328125, "completions/mean_terminated_length": 542.7333374023438, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.176, "grad_norm": 0.0047814385034143925, "kl": 0.159088134765625, "learning_rate": 9.722222222222224e-07, "loss": 0.0187, "num_tokens": 39769178.0, "reward": 0.6197627782821655, "reward_std": 0.2257610261440277, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.7219374775886536, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.22305673360824585, "step": 165 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7330788477577468, "calib/avg_num_step_conf": 5.54296875, "calib/ece": 0.15984087301587302, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.23015873015873015, "calib/gap": 0.3154504202219798, "calib/mean_conf": 0.5316670634920636, "calib/mu_c": 0.668111888111888, "calib/mu_w": 0.35266146788990826, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.06202380952380954, "calib/std_conf": 0.3670971265265847, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.40082370560547714, "calib/step_q_c_n": 779.0, "calib/step_q_gap": 0.04471630977214386, "calib/step_q_w": 0.3561073958333333, "calib/step_q_w_n": 640.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2072.0, "completions/max_terminated_length": 2072.0, "completions/mean_length": 541.83203125, "completions/mean_terminated_length": 543.9569091796875, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.17706666666666668, "grad_norm": 0.004498959984630346, "kl": 0.1609344482421875, "learning_rate": 9.444444444444445e-07, "loss": 0.0236, "num_tokens": 40014071.0, "reward": 0.664745569229126, "reward_std": 0.26040273904800415, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7511249780654907, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.2728974223136902, "step": 166 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.559514687100894, "calib/avg_num_step_conf": 5.59765625, "calib/ece": 0.3013043478260869, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.2845849802371542, "calib/gap": 0.07738761174968056, "calib/mean_conf": 0.5564822134387353, "calib/mu_c": 0.5895172413793103, "calib/mu_w": 0.5121296296296297, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14233201581027666, "calib/std_conf": 0.3699501577900132, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3994137931034483, "calib/step_q_c_n": 841.0, "calib/step_q_gap": -0.020249551491146267, "calib/step_q_w": 0.4196633445945946, "calib/step_q_w_n": 592.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1781.0, "completions/max_terminated_length": 1781.0, "completions/mean_length": 538.44921875, "completions/mean_terminated_length": 538.44921875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.17813333333333334, "grad_norm": 0.0044858017936348915, "kl": 0.15997314453125, "learning_rate": 9.166666666666666e-07, "loss": 0.0062, "num_tokens": 40257522.0, "reward": 0.589515209197998, "reward_std": 0.2210143506526947, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6483847498893738, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.21970805525779724, "step": 167 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7101552795031056, "calib/avg_num_step_conf": 5.453125, "calib/ece": 0.19098039215686277, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.23137254901960785, "calib/gap": 0.2726459627329192, "calib/mean_conf": 0.5204705882352941, "calib/mu_c": 0.6434285714285713, "calib/mu_w": 0.37078260869565216, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08121568627450981, "calib/std_conf": 0.37339220577166116, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4247226519337016, "calib/step_q_c_n": 724.0, "calib/step_q_gap": 0.061910151933701585, "calib/step_q_w": 0.36281250000000004, "calib/step_q_w_n": 672.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1329.0, "completions/max_terminated_length": 1329.0, "completions/mean_length": 532.8984375, "completions/mean_terminated_length": 534.98828125, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.1792, "grad_norm": 0.004624354187399149, "kl": 0.161590576171875, "learning_rate": 8.88888888888889e-07, "loss": 0.0188, "num_tokens": 40498616.0, "reward": 0.6443181037902832, "reward_std": 0.20038428902626038, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7442601919174194, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.23578235507011414, "step": 168 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6509328358208957, "calib/avg_num_step_conf": 5.6484375, "calib/ece": 0.18314960629921248, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.25196850393700787, "calib/gap": 0.20415796019900495, "calib/mean_conf": 0.5636220472440945, "calib/mu_c": 0.6600746268656716, "calib/mu_w": 0.4559166666666667, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.10960629921259832, "calib/std_conf": 0.34923238499102205, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.43017673469387757, "calib/step_q_c_n": 735.0, "calib/step_q_gap": 0.055638900657309365, "calib/step_q_w": 0.3745378340365682, "calib/step_q_w_n": 711.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2881.0, "completions/max_terminated_length": 2881.0, "completions/mean_length": 529.83984375, "completions/mean_terminated_length": 531.9176635742188, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.18026666666666666, "grad_norm": 0.005059287417680025, "kl": 0.16729736328125, "learning_rate": 8.611111111111112e-07, "loss": -0.0123, "num_tokens": 40738439.0, "reward": 0.6686310768127441, "reward_std": 0.24430686235427856, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.715927004814148, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.3197726011276245, "step": 169 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6846125186289121, "calib/avg_num_step_conf": 5.71484375, "calib/ece": 0.1486220472440945, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.28346456692913385, "calib/gap": 0.2398559364133135, "calib/mean_conf": 0.5755511811023621, "calib/mu_c": 0.6907575757575758, "calib/mu_w": 0.4509016393442623, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.102244094488189, "calib/std_conf": 0.3417455932855625, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4247980316645272, "calib/step_q_c_n": 779.0, "calib/step_q_gap": 0.0526141135358722, "calib/step_q_w": 0.372183918128655, "calib/step_q_w_n": 684.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2657.0, "completions/max_terminated_length": 2657.0, "completions/mean_length": 537.73046875, "completions/mean_terminated_length": 537.73046875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.18133333333333335, "grad_norm": 0.004425270017236471, "kl": 0.15838623046875, "learning_rate": 8.333333333333333e-07, "loss": 0.0072, "num_tokens": 40980250.0, "reward": 0.6666654348373413, "reward_std": 0.25412505865097046, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7407152652740479, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.29183441400527954, "step": 170 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6623986735445836, "calib/avg_num_step_conf": 5.76171875, "calib/ece": 0.2222656249999999, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.23828125, "calib/gap": 0.20489560304593468, "calib/mean_conf": 0.526328125, "calib/mu_c": 0.6367796610169492, "calib/mu_w": 0.4318840579710145, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14382812499999997, "calib/std_conf": 0.3605884320579133, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4277753472222222, "calib/step_q_c_n": 672.0, "calib/step_q_gap": 0.010422296163691724, "calib/step_q_w": 0.4173530510585305, "calib/step_q_w_n": 803.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1227.0, "completions/max_terminated_length": 1227.0, "completions/mean_length": 507.859375, "completions/mean_terminated_length": 509.85101318359375, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.1824, "grad_norm": 0.004862784408032894, "kl": 0.1643218994140625, "learning_rate": 8.055555555555557e-07, "loss": 0.0235, "num_tokens": 41217158.0, "reward": 0.6161177158355713, "reward_std": 0.2292150855064392, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.7190483808517456, "rewards/format_reward_step": 1.0, "rewards/step_margin_reward": 0.22099950909614563, "step": 171 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6468415937803693, "calib/avg_num_step_conf": 5.52734375, "calib/ece": 0.1923809523809524, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.27380952380952384, "calib/gap": 0.16925170068027195, "calib/mean_conf": 0.6268253968253968, "calib/mu_c": 0.6973469387755101, "calib/mu_w": 0.5280952380952382, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.11793650793650795, "calib/std_conf": 0.32725566626268365, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4019522102747909, "calib/step_q_c_n": 837.0, "calib/step_q_gap": -0.008498770117365995, "calib/step_q_w": 0.4104509803921569, "calib/step_q_w_n": 578.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2931.0, "completions/max_terminated_length": 2931.0, "completions/mean_length": 513.92578125, "completions/mean_terminated_length": 515.9412231445312, "completions/min_length": 0.0, "completions/min_terminated_length": 194.0, "epoch": 0.18346666666666667, "grad_norm": 0.004686696920543909, "kl": 0.169036865234375, "learning_rate": 7.777777777777779e-07, "loss": 0.0545, "num_tokens": 41452075.0, "reward": 0.6795635223388672, "reward_std": 0.25730636715888977, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.714409351348877, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.33456137776374817, "step": 172 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6440162271805274, "calib/avg_num_step_conf": 5.87890625, "calib/ece": 0.20185119047619055, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.25, "calib/gap": 0.20437905679513196, "calib/mean_conf": 0.5958472222222223, "calib/mu_c": 0.6899264705882354, "calib/mu_w": 0.48554741379310346, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12900793650793657, "calib/std_conf": 0.3517430502018481, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.426232951653944, "calib/step_q_c_n": 786.0, "calib/step_q_gap": 0.08086007265533479, "calib/step_q_w": 0.3453728789986092, "calib/step_q_w_n": 719.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1909.0, "completions/max_terminated_length": 1909.0, "completions/mean_length": 561.88671875, "completions/mean_terminated_length": 564.0902099609375, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.18453333333333333, "grad_norm": 0.004296320024877787, "kl": 0.1624908447265625, "learning_rate": 7.5e-07, "loss": 0.0045, "num_tokens": 41699078.0, "reward": 0.5957044363021851, "reward_std": 0.2674137055873871, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7136157155036926, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.1754494607448578, "step": 173 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5652980132450331, "calib/avg_num_step_conf": 6.140625, "calib/ece": 0.2896573705179283, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.21912350597609562, "calib/gap": 0.07488543046357615, "calib/mean_conf": 0.48364940239043824, "calib/mu_c": 0.5287, "calib/mu_w": 0.4538145695364238, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18745019920318723, "calib/std_conf": 0.36043551095779086, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4055314183123878, "calib/step_q_c_n": 557.0, "calib/step_q_gap": 0.02268074507757667, "calib/step_q_w": 0.38285067323481115, "calib/step_q_w_n": 1015.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2551.0, "completions/max_terminated_length": 2551.0, "completions/mean_length": 611.9375, "completions/mean_terminated_length": 614.3372802734375, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.1856, "grad_norm": 0.004755265079438686, "kl": 0.1500396728515625, "learning_rate": 7.222222222222222e-07, "loss": -0.0174, "num_tokens": 41959966.0, "reward": 0.5307010412216187, "reward_std": 0.25636354088783264, "rewards/accuracy_reward_step": 0.390625, "rewards/final_brier_reward_step": 0.6387794613838196, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.1499662697315216, "step": 174 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6603300330033004, "calib/avg_num_step_conf": 6.05859375, "calib/ece": 0.23573705179282872, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.1752988047808765, "calib/gap": 0.20679207920792086, "calib/mean_conf": 0.487211155378486, "calib/mu_c": 0.6107920792079208, "calib/mu_w": 0.4039999999999999, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16027888446215144, "calib/std_conf": 0.35651057777686285, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4259963099630996, "calib/step_q_c_n": 542.0, "calib/step_q_gap": 0.0851025868048571, "calib/step_q_w": 0.3408937231582425, "calib/step_q_w_n": 1009.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2187.0, "completions/max_terminated_length": 2187.0, "completions/mean_length": 558.203125, "completions/mean_terminated_length": 560.3922119140625, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.18666666666666668, "grad_norm": 0.004802222829312086, "kl": 0.1537017822265625, "learning_rate": 6.944444444444446e-07, "loss": 0.0235, "num_tokens": 42208690.0, "reward": 0.5531376600265503, "reward_std": 0.22892574965953827, "rewards/accuracy_reward_step": 0.39453125, "rewards/final_brier_reward_step": 0.7105348110198975, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.1207406222820282, "step": 175 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6774114774114773, "calib/avg_num_step_conf": 5.90234375, "calib/ece": 0.21753906250000005, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.22265625, "calib/gap": 0.22457020757020763, "calib/mean_conf": 0.5219921875, "calib/mu_c": 0.6360317460317461, "calib/mu_w": 0.41146153846153843, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12367187499999999, "calib/std_conf": 0.35720422633832993, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.447039908045977, "calib/step_q_c_n": 725.0, "calib/step_q_gap": 0.05842192670585833, "calib/step_q_w": 0.38861798134011866, "calib/step_q_w_n": 786.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1570.0, "completions/max_terminated_length": 1570.0, "completions/mean_length": 521.58984375, "completions/mean_terminated_length": 523.6353149414062, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.18773333333333334, "grad_norm": 0.004365100525319576, "kl": 0.161163330078125, "learning_rate": 6.666666666666667e-07, "loss": 0.0072, "num_tokens": 42446281.0, "reward": 0.6350507140159607, "reward_std": 0.23502111434936523, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.7338355183601379, "rewards/format_reward_step": 1.0, "rewards/step_margin_reward": 0.23782828450202942, "step": 176 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6791389680278569, "calib/avg_num_step_conf": 5.734375, "calib/ece": 0.20876984126984127, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.25, "calib/gap": 0.23434188034188025, "calib/mean_conf": 0.5348015873015873, "calib/mu_c": 0.6603418803418802, "calib/mu_w": 0.426, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13964285714285715, "calib/std_conf": 0.3649031730634391, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4506144356955381, "calib/step_q_c_n": 635.0, "calib/step_q_gap": 0.07512836126576622, "calib/step_q_w": 0.3754860744297719, "calib/step_q_w_n": 833.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2898.0, "completions/max_terminated_length": 2898.0, "completions/mean_length": 541.6171875, "completions/mean_terminated_length": 543.7412109375, "completions/min_length": 0.0, "completions/min_terminated_length": 216.0, "epoch": 0.1888, "grad_norm": 0.004700418561697006, "kl": 0.1542205810546875, "learning_rate": 6.388888888888889e-07, "loss": 0.0144, "num_tokens": 42688767.0, "reward": 0.6129065155982971, "reward_std": 0.2510068416595459, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.7183199524879456, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.21921183168888092, "step": 177 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6999378689033862, "calib/avg_num_step_conf": 5.7890625, "calib/ece": 0.16547265625000002, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.21484375, "calib/gap": 0.2520751164958063, "calib/mean_conf": 0.53007421875, "calib/mu_c": 0.6393724137931035, "calib/mu_w": 0.3872972972972972, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.06457031249999999, "calib/std_conf": 0.35354642767651767, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4334288095238095, "calib/step_q_c_n": 840.0, "calib/step_q_gap": 0.07878654576472333, "calib/step_q_w": 0.3546422637590862, "calib/step_q_w_n": 642.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1346.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 501.609375, "completions/mean_terminated_length": 503.5765075683594, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.18986666666666666, "grad_norm": 0.004584399983286858, "kl": 0.1641693115234375, "learning_rate": 6.111111111111112e-07, "loss": -0.0016, "num_tokens": 42923251.0, "reward": 0.6758027076721191, "reward_std": 0.22934101521968842, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7480090260505676, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.2910963296890259, "step": 178 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6706262807377048, "calib/avg_num_step_conf": 5.80078125, "calib/ece": 0.19084, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.204, "calib/gap": 0.19332351434426215, "calib/mean_conf": 0.5530799999999999, "calib/mu_c": 0.6474218749999999, "calib/mu_w": 0.4540983606557378, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11595999999999998, "calib/std_conf": 0.3392670240385882, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4167318361955086, "calib/step_q_c_n": 757.0, "calib/step_q_gap": 0.04583897905265144, "calib/step_q_w": 0.37089285714285714, "calib/step_q_w_n": 728.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2993.0, "completions/max_terminated_length": 2993.0, "completions/mean_length": 560.765625, "completions/mean_terminated_length": 560.765625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.19093333333333334, "grad_norm": 0.004369755275547504, "kl": 0.162078857421875, "learning_rate": 5.833333333333334e-07, "loss": 0.0654, "num_tokens": 43173071.0, "reward": 0.6135045289993286, "reward_std": 0.2431424856185913, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7089457511901855, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.22353218495845795, "step": 179 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6722066604419545, "calib/avg_num_step_conf": 5.73046875, "calib/ece": 0.18620734908136477, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.2125984251968504, "calib/gap": 0.22150368295466327, "calib/mean_conf": 0.550249343832021, "calib/mu_c": 0.6540246913580247, "calib/mu_w": 0.4325210084033614, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10248031496062987, "calib/std_conf": 0.34653134212289904, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.41154993514915694, "calib/step_q_c_n": 771.0, "calib/step_q_gap": 0.016650030934597604, "calib/step_q_w": 0.39489990421455934, "calib/step_q_w_n": 696.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3060.0, "completions/max_terminated_length": 3060.0, "completions/mean_length": 583.76171875, "completions/mean_terminated_length": 586.051025390625, "completions/min_length": 0.0, "completions/min_terminated_length": 187.0, "epoch": 0.192, "grad_norm": 0.00433363439515233, "kl": 0.15771484375, "learning_rate": 5.555555555555555e-07, "loss": 0.0156, "num_tokens": 43426370.0, "reward": 0.6204407215118408, "reward_std": 0.229685440659523, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7350806593894958, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.20189449191093445, "step": 180 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7165070242656449, "calib/avg_num_step_conf": 5.98828125, "calib/ece": 0.141897233201581, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.17786561264822134, "calib/gap": 0.25479118773946385, "calib/mean_conf": 0.508695652173913, "calib/mu_c": 0.6547222222222224, "calib/mu_w": 0.3999310344827586, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11185770750988143, "calib/std_conf": 0.33311602012883124, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4376819078947368, "calib/step_q_c_n": 608.0, "calib/step_q_gap": 0.06188947546230439, "calib/step_q_w": 0.37579243243243243, "calib/step_q_w_n": 925.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2604.0, "completions/max_terminated_length": 2604.0, "completions/mean_length": 522.85546875, "completions/mean_terminated_length": 522.85546875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.19306666666666666, "grad_norm": 0.08254922181367874, "kl": 1.1282501220703125, "learning_rate": 5.277777777777779e-07, "loss": 0.0742, "num_tokens": 43666485.0, "reward": 0.616002082824707, "reward_std": 0.24907004833221436, "rewards/accuracy_reward_step": 0.421875, "rewards/final_brier_reward_step": 0.7522082328796387, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.1985459327697754, "step": 181 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6683222958057395, "calib/avg_num_step_conf": 5.58984375, "calib/ece": 0.17320158102766797, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.17786561264822134, "calib/gap": 0.1990053239838982, "calib/mean_conf": 0.5536758893280633, "calib/mu_c": 0.6339072847682119, "calib/mu_w": 0.4349019607843137, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.06501976284584982, "calib/std_conf": 0.3407958682349704, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.41393501805054156, "calib/step_q_c_n": 831.0, "calib/step_q_gap": 0.05379612916165272, "calib/step_q_w": 0.36013888888888884, "calib/step_q_w_n": 600.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1441.0, "completions/max_terminated_length": 1441.0, "completions/mean_length": 523.234375, "completions/mean_terminated_length": 525.2863159179688, "completions/min_length": 0.0, "completions/min_terminated_length": 102.0, "epoch": 0.19413333333333332, "grad_norm": 0.004902321379631758, "kl": 0.167877197265625, "learning_rate": 5.000000000000001e-07, "loss": 0.0075, "num_tokens": 43906593.0, "reward": 0.671410858631134, "reward_std": 0.2326556146144867, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7256957292556763, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.30228227376937866, "step": 182 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6664479440069991, "calib/avg_num_step_conf": 5.48046875, "calib/ece": 0.21691699604743084, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.1857707509881423, "calib/gap": 0.20984876890388693, "calib/mean_conf": 0.4854545454545454, "calib/mu_c": 0.5907936507936506, "calib/mu_w": 0.3809448818897637, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10217391304347825, "calib/std_conf": 0.36479065807455824, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.43924961597542245, "calib/step_q_c_n": 651.0, "calib/step_q_gap": 0.0876812205853515, "calib/step_q_w": 0.35156839539007095, "calib/step_q_w_n": 752.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1870.0, "completions/max_terminated_length": 1870.0, "completions/mean_length": 536.94140625, "completions/mean_terminated_length": 541.1693115234375, "completions/min_length": 0.0, "completions/min_terminated_length": 203.0, "epoch": 0.1952, "grad_norm": 0.004713596776127815, "kl": 0.1636810302734375, "learning_rate": 4.7222222222222226e-07, "loss": 0.0155, "num_tokens": 44150730.0, "reward": 0.6190246939659119, "reward_std": 0.19719631969928741, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.7093328237533569, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.2334040105342865, "step": 183 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.609497694129378, "calib/avg_num_step_conf": 5.50390625, "calib/ece": 0.2241094117647059, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.20392156862745098, "calib/gap": 0.1496872616228344, "calib/mean_conf": 0.5224003921568627, "calib/mu_c": 0.5887323943661973, "calib/mu_w": 0.4390451327433629, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09482352941176468, "calib/std_conf": 0.3562133593401506, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4167713058419244, "calib/step_q_c_n": 776.0, "calib/step_q_gap": 0.009820278985684328, "calib/step_q_w": 0.4069510268562401, "calib/step_q_w_n": 633.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1481.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 524.9140625, "completions/mean_terminated_length": 526.9725952148438, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.19626666666666667, "grad_norm": 0.004596102982759476, "kl": 0.1666717529296875, "learning_rate": 4.444444444444445e-07, "loss": -0.0068, "num_tokens": 44390388.0, "reward": 0.614298939704895, "reward_std": 0.2608785331249237, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6963027715682983, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.22213885188102722, "step": 184 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7102599268547544, "calib/avg_num_step_conf": 6.01171875, "calib/ece": 0.1720161290322581, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.27419354838709675, "calib/gap": 0.2664472309299896, "calib/mean_conf": 0.5761290322580644, "calib/mu_c": 0.7007575757575758, "calib/mu_w": 0.4343103448275862, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10794354838709683, "calib/std_conf": 0.36060032758429683, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42642047026279395, "calib/step_q_c_n": 723.0, "calib/step_q_gap": 0.08244007810593124, "calib/step_q_w": 0.3439803921568627, "calib/step_q_w_n": 816.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2745.0, "completions/max_terminated_length": 2745.0, "completions/mean_length": 549.328125, "completions/mean_terminated_length": 551.4823608398438, "completions/min_length": 0.0, "completions/min_terminated_length": 177.0, "epoch": 0.19733333333333333, "grad_norm": 0.004440247546881437, "kl": 0.1527557373046875, "learning_rate": 4.1666666666666667e-07, "loss": 0.0455, "num_tokens": 44637936.0, "reward": 0.5938361287117004, "reward_std": 0.23641090095043182, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.7243554592132568, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.16644182801246643, "step": 185 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6337535014005603, "calib/avg_num_step_conf": 5.8828125, "calib/ece": 0.18752988047808766, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.23107569721115537, "calib/gap": 0.17142793481028779, "calib/mean_conf": 0.5533466135458168, "calib/mu_c": 0.6346212121212121, "calib/mu_w": 0.46319327731092436, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10749003984063746, "calib/std_conf": 0.3492233206242315, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.420696370967742, "calib/step_q_c_n": 744.0, "calib/step_q_gap": 0.04485267018034034, "calib/step_q_w": 0.37584370078740165, "calib/step_q_w_n": 762.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2601.0, "completions/max_terminated_length": 2601.0, "completions/mean_length": 548.3515625, "completions/mean_terminated_length": 550.5020141601562, "completions/min_length": 0.0, "completions/min_terminated_length": 199.0, "epoch": 0.1984, "grad_norm": 0.0045608277432620525, "kl": 0.1638031005859375, "learning_rate": 3.8888888888888895e-07, "loss": 0.0158, "num_tokens": 44883354.0, "reward": 0.596014142036438, "reward_std": 0.24676115810871124, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.6918773651123047, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.20249465107917786, "step": 186 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6417312661498709, "calib/avg_num_step_conf": 6.0859375, "calib/ece": 0.24040160642570285, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.26104417670682734, "calib/gap": 0.18380813953488379, "calib/mean_conf": 0.5526907630522089, "calib/mu_c": 0.6479166666666667, "calib/mu_w": 0.4641085271317829, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15558232931726912, "calib/std_conf": 0.35854512572862435, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3941972101972102, "calib/step_q_c_n": 693.0, "calib/step_q_gap": 0.028360524262720788, "calib/step_q_w": 0.3658366859344894, "calib/step_q_w_n": 865.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2515.0, "completions/max_terminated_length": 2515.0, "completions/mean_length": 558.8984375, "completions/mean_terminated_length": 561.0902099609375, "completions/min_length": 0.0, "completions/min_terminated_length": 184.0, "epoch": 0.19946666666666665, "grad_norm": 0.004232458770275116, "kl": 0.167236328125, "learning_rate": 3.611111111111111e-07, "loss": 0.0541, "num_tokens": 45127976.0, "reward": 0.6228945851325989, "reward_std": 0.23268939554691315, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.6891741752624512, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.2675524652004242, "step": 187 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6111542443064182, "calib/avg_num_step_conf": 5.90234375, "calib/ece": 0.22211999999999998, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.256, "calib/gap": 0.14758928571428576, "calib/mean_conf": 0.58388, "calib/mu_c": 0.65, "calib/mu_w": 0.5024107142857143, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.127, "calib/std_conf": 0.3454801667245169, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.422124347826087, "calib/step_q_c_n": 805.0, "calib/step_q_gap": 0.05691429116886321, "calib/step_q_w": 0.3652100566572238, "calib/step_q_w_n": 706.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2363.0, "completions/max_terminated_length": 2363.0, "completions/mean_length": 570.59375, "completions/mean_terminated_length": 577.3596801757812, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.20053333333333334, "grad_norm": 0.004320810083299875, "kl": 0.1486968994140625, "learning_rate": 3.3333333333333335e-07, "loss": 0.0141, "num_tokens": 45378120.0, "reward": 0.6399295330047607, "reward_std": 0.2552989721298218, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6887964606285095, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.287156343460083, "step": 188 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7073451602863368, "calib/avg_num_step_conf": 5.75390625, "calib/ece": 0.17826771653543305, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.2283464566929134, "calib/gap": 0.2525477746654217, "calib/mean_conf": 0.5096062992125984, "calib/mu_c": 0.627925925925926, "calib/mu_w": 0.3753781512605043, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.07818897637795273, "calib/std_conf": 0.3587651647315927, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.40935324200913237, "calib/step_q_c_n": 730.0, "calib/step_q_gap": 0.086860061210568, "calib/step_q_w": 0.32249318079856437, "calib/step_q_w_n": 743.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2301.0, "completions/max_terminated_length": 2301.0, "completions/mean_length": 544.44921875, "completions/mean_terminated_length": 544.44921875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.2016, "grad_norm": 0.004429060034453869, "kl": 0.1707916259765625, "learning_rate": 3.055555555555556e-07, "loss": 0.0343, "num_tokens": 45625267.0, "reward": 0.6407199501991272, "reward_std": 0.21337494254112244, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7381328344345093, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.2401820719242096, "step": 189 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7381419175309891, "calib/avg_num_step_conf": 5.9609375, "calib/ece": 0.14444444444444443, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.21031746031746032, "calib/gap": 0.3039615481912472, "calib/mean_conf": 0.510952380952381, "calib/mu_c": 0.6532835820895523, "calib/mu_w": 0.3493220338983051, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06182539682539681, "calib/std_conf": 0.3595774163755935, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4141890166028097, "calib/step_q_c_n": 783.0, "calib/step_q_gap": 0.035158061017345366, "calib/step_q_w": 0.37903095558546435, "calib/step_q_w_n": 743.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2660.0, "completions/max_terminated_length": 2660.0, "completions/mean_length": 580.78515625, "completions/mean_terminated_length": 583.0628051757812, "completions/min_length": 0.0, "completions/min_terminated_length": 45.0, "epoch": 0.20266666666666666, "grad_norm": 0.004130993504077196, "kl": 0.1551055908203125, "learning_rate": 2.7777777777777776e-07, "loss": 0.0263, "num_tokens": 45879556.0, "reward": 0.6636803150177002, "reward_std": 0.24022971093654633, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.760574996471405, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.26522326469421387, "step": 190 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7302767052767052, "calib/avg_num_step_conf": 6.11328125, "calib/ece": 0.17864541832669323, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.30677290836653387, "calib/gap": 0.3050456885456885, "calib/mean_conf": 0.5969721115537848, "calib/mu_c": 0.767117117117117, "calib/mu_w": 0.4620714285714286, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16669322709163348, "calib/std_conf": 0.3553310493057434, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4486068111455109, "calib/step_q_c_n": 646.0, "calib/step_q_gap": 0.09276459134137593, "calib/step_q_w": 0.35584221980413494, "calib/step_q_w_n": 919.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2223.0, "completions/max_terminated_length": 2223.0, "completions/mean_length": 571.17578125, "completions/mean_terminated_length": 571.17578125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.20373333333333332, "grad_norm": 0.00448433356359601, "kl": 0.1585540771484375, "learning_rate": 2.5000000000000004e-07, "loss": 0.0019, "num_tokens": 46129945.0, "reward": 0.5969818830490112, "reward_std": 0.2233514040708542, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.7355577945709229, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.17637458443641663, "step": 191 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7258225324027916, "calib/avg_num_step_conf": 5.46484375, "calib/ece": 0.16803149606299206, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.20866141732283464, "calib/gap": 0.28928339980059814, "calib/mean_conf": 0.5285826771653543, "calib/mu_c": 0.6834745762711864, "calib/mu_w": 0.3941911764705882, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11602362204724402, "calib/std_conf": 0.36059601669870744, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.43136098654708527, "calib/step_q_c_n": 669.0, "calib/step_q_gap": 0.05121139750598935, "calib/step_q_w": 0.3801495890410959, "calib/step_q_w_n": 730.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2812.0, "completions/max_terminated_length": 2812.0, "completions/mean_length": 550.34765625, "completions/mean_terminated_length": 550.34765625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.2048, "grad_norm": 0.004605600144714117, "kl": 0.1724395751953125, "learning_rate": 2.2222222222222224e-07, "loss": 0.0644, "num_tokens": 46375810.0, "reward": 0.6355305910110474, "reward_std": 0.22681453824043274, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.7550976276397705, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.22533850371837616, "step": 192 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6440483036227717, "calib/avg_num_step_conf": 5.71484375, "calib/ece": 0.2073968253968254, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.1984126984126984, "calib/gap": 0.1773064979873491, "calib/mean_conf": 0.48866666666666664, "calib/mu_c": 0.5878738738738739, "calib/mu_w": 0.4105673758865248, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12779365079365085, "calib/std_conf": 0.3521201544566899, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.40478499210110586, "calib/step_q_c_n": 633.0, "calib/step_q_gap": 0.037604269209539576, "calib/step_q_w": 0.3671807228915663, "calib/step_q_w_n": 830.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1475.0, "completions/max_terminated_length": 1475.0, "completions/mean_length": 533.0546875, "completions/mean_terminated_length": 535.1451416015625, "completions/min_length": 0.0, "completions/min_terminated_length": 39.0, "epoch": 0.20586666666666667, "grad_norm": 0.004652594216167927, "kl": 0.166473388671875, "learning_rate": 1.9444444444444447e-07, "loss": -0.0153, "num_tokens": 46617984.0, "reward": 0.6170186996459961, "reward_std": 0.25677111744880676, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.7034628391265869, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.24619954824447632, "step": 193 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7412958480729845, "calib/avg_num_step_conf": 5.98046875, "calib/ece": 0.1687401574803149, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.22440944881889763, "calib/gap": 0.3154831502513498, "calib/mean_conf": 0.5441732283464566, "calib/mu_c": 0.6969465648854961, "calib/mu_w": 0.3814634146341463, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.09858267716535431, "calib/std_conf": 0.3682592397072139, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.40561673202614373, "calib/step_q_c_n": 765.0, "calib/step_q_gap": 0.04993387736992094, "calib/step_q_w": 0.3556828546562228, "calib/step_q_w_n": 766.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3067.0, "completions/max_terminated_length": 3067.0, "completions/mean_length": 525.4140625, "completions/mean_terminated_length": 525.4140625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.20693333333333333, "grad_norm": 0.004900243133306503, "kl": 0.1553802490234375, "learning_rate": 1.6666666666666668e-07, "loss": 0.0207, "num_tokens": 46858434.0, "reward": 0.6249498724937439, "reward_std": 0.22597366571426392, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7653836011886597, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.18373483419418335, "step": 194 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6280734516028634, "calib/avg_num_step_conf": 5.71875, "calib/ece": 0.24606299212598423, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.1968503937007874, "calib/gap": 0.15394211017740417, "calib/mean_conf": 0.49456692913385825, "calib/mu_c": 0.5763865546218486, "calib/mu_w": 0.42244444444444446, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13606299212598424, "calib/std_conf": 0.35563352787724095, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.41319080291970806, "calib/step_q_c_n": 685.0, "calib/step_q_gap": 0.029657683535882662, "calib/step_q_w": 0.3835331193838254, "calib/step_q_w_n": 779.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1285.0, "completions/max_terminated_length": 1285.0, "completions/mean_length": 512.2421875, "completions/mean_terminated_length": 514.2510375976562, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.208, "grad_norm": 0.005007847677916288, "kl": 0.173828125, "learning_rate": 1.3888888888888888e-07, "loss": -0.0025, "num_tokens": 47095552.0, "reward": 0.6037704944610596, "reward_std": 0.20243516564369202, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.6950304508209229, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.221104234457016, "step": 195 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6890919158361017, "calib/avg_num_step_conf": 5.43359375, "calib/ece": 0.17752941176470585, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.30980392156862746, "calib/gap": 0.2530952380952382, "calib/mean_conf": 0.6149411764705882, "calib/mu_c": 0.7400000000000001, "calib/mu_w": 0.4869047619047619, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14329411764705882, "calib/std_conf": 0.34986705664309997, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4632410886319847, "calib/step_q_c_n": 692.0, "calib/step_q_gap": 0.03366550446412592, "calib/step_q_w": 0.4295755841678588, "calib/step_q_w_n": 699.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2328.0, "completions/max_terminated_length": 2328.0, "completions/mean_length": 464.1953125, "completions/mean_terminated_length": 464.1953125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.20906666666666668, "grad_norm": 0.004955775570124388, "kl": 0.1866455078125, "learning_rate": 1.1111111111111112e-07, "loss": 0.0534, "num_tokens": 47316930.0, "reward": 0.638756275177002, "reward_std": 0.2346639633178711, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7393644452095032, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.2381480634212494, "step": 196 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6842257318952234, "calib/avg_num_step_conf": 5.91015625, "calib/ece": 0.1882399999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.18, "calib/gap": 0.22706728299948636, "calib/mean_conf": 0.5196000000000001, "calib/mu_c": 0.6394915254237288, "calib/mu_w": 0.4124242424242424, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11791999999999993, "calib/std_conf": 0.34875297848190484, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.42947526236881556, "calib/step_q_c_n": 667.0, "calib/step_q_gap": 0.051351030690328536, "calib/step_q_w": 0.378124231678487, "calib/step_q_w_n": 846.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3015.0, "completions/max_terminated_length": 3015.0, "completions/mean_length": 567.01171875, "completions/mean_terminated_length": 573.7352294921875, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.21013333333333334, "grad_norm": 0.004286373499780893, "kl": 0.16259765625, "learning_rate": 8.333333333333334e-08, "loss": 0.0174, "num_tokens": 47567141.0, "reward": 0.6364597082138062, "reward_std": 0.2432287335395813, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.7226440906524658, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.26355651021003723, "step": 197 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7110732009925559, "calib/avg_num_step_conf": 5.6484375, "calib/ece": 0.1606299212598425, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.23622047244094488, "calib/gap": 0.27094044665012407, "calib/mean_conf": 0.5249606299212598, "calib/mu_c": 0.6572307692307692, "calib/mu_w": 0.3862903225806451, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.08688976377952755, "calib/std_conf": 0.3550953604110169, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.41581380563124426, "calib/step_q_c_n": 734.0, "calib/step_q_gap": 0.02308978877731166, "calib/step_q_w": 0.3927240168539326, "calib/step_q_w_n": 712.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2404.0, "completions/max_terminated_length": 2404.0, "completions/mean_length": 496.15234375, "completions/mean_terminated_length": 498.09808349609375, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.2112, "grad_norm": 0.005007683299481869, "kl": 0.168792724609375, "learning_rate": 5.555555555555556e-08, "loss": 0.0305, "num_tokens": 47799540.0, "reward": 0.6415027976036072, "reward_std": 0.23499776422977448, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7438390254974365, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.2415103018283844, "step": 198 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.663230457880091, "calib/avg_num_step_conf": 5.54296875, "calib/ece": 0.2093253968253968, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.29365079365079366, "calib/gap": 0.21554389071591207, "calib/mean_conf": 0.5963095238095237, "calib/mu_c": 0.6972388059701493, "calib/mu_w": 0.48169491525423724, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13694444444444442, "calib/std_conf": 0.3578100712000072, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4378330110497237, "calib/step_q_c_n": 724.0, "calib/step_q_gap": 0.06646610457490354, "calib/step_q_w": 0.3713669064748202, "calib/step_q_w_n": 695.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2816.0, "completions/max_terminated_length": 2816.0, "completions/mean_length": 552.94140625, "completions/mean_terminated_length": 555.10986328125, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.21226666666666666, "grad_norm": 0.0044085741974413395, "kl": 0.1686553955078125, "learning_rate": 2.777777777777778e-08, "loss": 0.0059, "num_tokens": 48045293.0, "reward": 0.6249779462814331, "reward_std": 0.28111791610717773, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7144218683242798, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.23475277423858643, "step": 199 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7524773937817416, "calib/avg_num_step_conf": 5.58203125, "calib/ece": 0.11980392156862743, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2235294117647059, "calib/gap": 0.30327015979189903, "calib/mean_conf": 0.5554901960784313, "calib/mu_c": 0.6946376811594204, "calib/mu_w": 0.39136752136752134, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06705882352941178, "calib/std_conf": 0.34173211458564623, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.43393265398550723, "calib/step_q_c_n": 736.0, "calib/step_q_gap": 0.023146218199071533, "calib/step_q_w": 0.4107864357864357, "calib/step_q_w_n": 693.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1414.0, "completions/max_terminated_length": 1414.0, "completions/mean_length": 537.23828125, "completions/mean_terminated_length": 539.3451538085938, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.21333333333333335, "grad_norm": 0.004761539865285158, "kl": 0.1644439697265625, "learning_rate": 0.0, "loss": 0.0154, "num_tokens": 48290874.0, "reward": 0.7025998830795288, "reward_std": 0.21370618045330048, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.782248854637146, "rewards/format_reward_step": 0.99609375, "rewards/step_margin_reward": 0.31591975688934326, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": 0.03833882060367614, "train_runtime": 12677.532, "train_samples_per_second": 4.039, "train_steps_per_second": 0.016 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 48290874, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }