{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5086206896551725, "calib/avg_num_step_conf": 7.875, "calib/ece": 0.2888991935483871, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0001713264989126051, "calib/mean_conf": 0.9905120967741936, "calib/mu_c": 0.9905632183908043, "calib/mu_w": 0.9903918918918917, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2888991935483871, "calib/std_conf": 0.0021794159006610276, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9119477557027226, "calib/step_q_c_n": 1359.0, "calib/step_q_gap": 0.0056311651395566376, "calib/step_q_w": 0.9063165905631659, "calib/step_q_w_n": 657.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2494.0, "completions/max_terminated_length": 2494.0, "completions/mean_length": 755.49609375, "completions/mean_terminated_length": 776.7349243164062, "completions/min_length": 0.0, "completions/min_terminated_length": 397.0, "epoch": 0.0010666666666666667, "grad_norm": 0.00794170517474413, "kl": 0.0005849599838256836, "learning_rate": 2.5000000000000004e-07, "loss": -0.0572, "num_tokens": 300991.0, "reward": 0.8751538991928101, "reward_std": 0.2377150058746338, "rewards/accuracy_reward_step": 0.6796875, "rewards/asymmetric_l2_reward": 0.7354698181152344, "rewards/final_brier_reward_step": 0.6851503849029541, "rewards/format_reward_step": 0.96875, "step": 1 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4872611464968153, "calib/avg_num_step_conf": 7.6953125, "calib/ece": 0.36465737051792824, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00024481637078155316, "calib/mean_conf": 0.9901553784860557, "calib/mu_c": 0.990063694267516, "calib/mu_w": 0.9903085106382975, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36465737051792824, "calib/std_conf": 0.001222205307190084, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9075405636208369, "calib/step_q_c_n": 1171.0, "calib/step_q_gap": -0.003804868168900244, "calib/step_q_w": 0.9113454317897371, "calib/step_q_w_n": 799.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2743.0, "completions/max_terminated_length": 2743.0, "completions/mean_length": 840.640625, "completions/mean_terminated_length": 850.6087036132812, "completions/min_length": 0.0, "completions/min_terminated_length": 466.0, "epoch": 0.0021333333333333334, "grad_norm": 0.006795755121856928, "kl": 0.0016820430755615234, "learning_rate": 5.000000000000001e-07, "loss": 0.0266, "num_tokens": 619483.0, "reward": 0.8191705346107483, "reward_std": 0.21779605746269226, "rewards/accuracy_reward_step": 0.61328125, "rewards/asymmetric_l2_reward": 0.6992889642715454, "rewards/final_brier_reward_step": 0.6203019618988037, "rewards/format_reward_step": 0.98046875, "step": 2 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4995745887691435, "calib/avg_num_step_conf": 7.703125, "calib/ece": 0.31306692913385814, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 8.508224618175575e-07, "calib/mean_conf": 0.9902322834645668, "calib/mu_c": 0.9902325581395348, "calib/mu_w": 0.990231707317073, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31306692913385814, "calib/std_conf": 0.0014944718019728367, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9121462264150944, "calib/step_q_c_n": 1272.0, "calib/step_q_gap": -0.0028009164420484955, "calib/step_q_w": 0.9149471428571428, "calib/step_q_w_n": 700.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2224.0, "completions/max_terminated_length": 2224.0, "completions/mean_length": 802.05859375, "completions/mean_terminated_length": 805.2039794921875, "completions/min_length": 0.0, "completions/min_terminated_length": 431.0, "epoch": 0.0032, "grad_norm": 0.0075466628186404705, "kl": 0.0005320906639099121, "learning_rate": 7.5e-07, "loss": 0.0078, "num_tokens": 930066.0, "reward": 0.8693341016769409, "reward_std": 0.2241458147764206, "rewards/accuracy_reward_step": 0.671875, "rewards/asymmetric_l2_reward": 0.7278196811676025, "rewards/final_brier_reward_step": 0.6780359148979187, "rewards/format_reward_step": 0.9921875, "step": 3 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5029940119760479, "calib/avg_num_step_conf": 8.01953125, "calib/ece": 0.3273412698412699, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 5.988023952108179e-05, "calib/mean_conf": 0.9900396825396826, "calib/mu_c": 0.9900598802395209, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3273412698412699, "calib/std_conf": 0.0006286896634029713, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9098512137823024, "calib/step_q_c_n": 1277.0, "calib/step_q_gap": -0.007713219207388322, "calib/step_q_w": 0.9175644329896907, "calib/step_q_w_n": 776.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2714.0, "completions/max_terminated_length": 2714.0, "completions/mean_length": 781.55859375, "completions/mean_terminated_length": 793.96435546875, "completions/min_length": 0.0, "completions/min_terminated_length": 367.0, "epoch": 0.004266666666666667, "grad_norm": 0.007215971127152443, "kl": 0.0005492568016052246, "learning_rate": 1.0000000000000002e-06, "loss": -0.0391, "num_tokens": 1236313.0, "reward": 0.8469743728637695, "reward_std": 0.21102729439735413, "rewards/accuracy_reward_step": 0.65234375, "rewards/asymmetric_l2_reward": 0.707718551158905, "rewards/final_brier_reward_step": 0.6588863134384155, "rewards/format_reward_step": 0.984375, "step": 4 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5189616967394746, "calib/avg_num_step_conf": 7.52734375, "calib/ece": 0.4544007936507938, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00037435897435889043, "calib/mean_conf": 0.9901150793650795, "calib/mu_c": 0.9902888888888887, "calib/mu_w": 0.9899145299145298, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4544007936507938, "calib/std_conf": 0.0013767668279207968, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9129371980676328, "calib/step_q_c_n": 1035.0, "calib/step_q_gap": 0.011614328112475869, "calib/step_q_w": 0.901322869955157, "calib/step_q_w_n": 892.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2478.0, "completions/max_terminated_length": 2478.0, "completions/mean_length": 822.24609375, "completions/mean_terminated_length": 831.99609375, "completions/min_length": 0.0, "completions/min_terminated_length": 451.0, "epoch": 0.005333333333333333, "grad_norm": 0.006809460464864969, "kl": 0.0006694197654724121, "learning_rate": 1.25e-06, "loss": -0.0171, "num_tokens": 1553496.0, "reward": 0.7390369772911072, "reward_std": 0.20035284757614136, "rewards/accuracy_reward_step": 0.52734375, "rewards/asymmetric_l2_reward": 0.6392657160758972, "rewards/final_brier_reward_step": 0.5364644527435303, "rewards/format_reward_step": 0.984375, "step": 5 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.502243039978889, "calib/avg_num_step_conf": 8.32421875, "calib/ece": 0.4158192771084338, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 3.852751022559442e-05, "calib/mean_conf": 0.9901164658634539, "calib/mu_c": 0.990132867132867, "calib/mu_w": 0.9900943396226414, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4158192771084338, "calib/std_conf": 0.001055911813966895, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9112607573149744, "calib/step_q_c_n": 1162.0, "calib/step_q_gap": -0.006603019774809038, "calib/step_q_w": 0.9178637770897834, "calib/step_q_w_n": 969.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2701.0, "completions/max_terminated_length": 2701.0, "completions/mean_length": 739.35546875, "completions/mean_terminated_length": 748.12255859375, "completions/min_length": 0.0, "completions/min_terminated_length": 449.0, "epoch": 0.0064, "grad_norm": 0.007401874754577875, "kl": 0.000727236270904541, "learning_rate": 1.5e-06, "loss": 0.01, "num_tokens": 1848723.0, "reward": 0.7569711208343506, "reward_std": 0.20083239674568176, "rewards/accuracy_reward_step": 0.55859375, "rewards/asymmetric_l2_reward": 0.6409914493560791, "rewards/final_brier_reward_step": 0.5667007565498352, "rewards/format_reward_step": 0.97265625, "step": 6 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.489010989010989, "calib/avg_num_step_conf": 7.2109375, "calib/ece": 0.35118650793650796, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00020879120879124802, "calib/mean_conf": 0.9900753968253968, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9902087912087911, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35118650793650796, "calib/std_conf": 0.0008441381918702945, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9099402220324508, "calib/step_q_c_n": 1171.0, "calib/step_q_gap": 0.0027742961065249094, "calib/step_q_w": 0.9071659259259259, "calib/step_q_w_n": 675.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2497.0, "completions/max_terminated_length": 2497.0, "completions/mean_length": 868.70703125, "completions/mean_terminated_length": 872.11376953125, "completions/min_length": 0.0, "completions/min_terminated_length": 423.0, "epoch": 0.007466666666666667, "grad_norm": 0.007204503286629915, "kl": 0.0005515217781066895, "learning_rate": 1.75e-06, "loss": 0.0165, "num_tokens": 2178536.0, "reward": 0.8298871517181396, "reward_std": 0.2348572164773941, "rewards/accuracy_reward_step": 0.62890625, "rewards/asymmetric_l2_reward": 0.701348602771759, "rewards/final_brier_reward_step": 0.6357694864273071, "rewards/format_reward_step": 0.984375, "step": 7 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5087550028587764, "calib/avg_num_step_conf": 7.56640625, "calib/ece": 0.34198380566801617, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9919028340080972, "calib/gap": 0.012221983990851504, "calib/mean_conf": 0.9857085020242914, "calib/mu_c": 0.9900628930817608, "calib/mu_w": 0.9778409090909093, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34198380566801617, "calib/std_conf": 0.06311261742570078, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9095822890559733, "calib/step_q_c_n": 1197.0, "calib/step_q_gap": 0.00013634311002730382, "calib/step_q_w": 0.909445945945946, "calib/step_q_w_n": 740.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2612.0, "completions/max_terminated_length": 2612.0, "completions/mean_length": 835.70703125, "completions/mean_terminated_length": 852.3546142578125, "completions/min_length": 0.0, "completions/min_terminated_length": 397.0, "epoch": 0.008533333333333334, "grad_norm": 0.006909141317009926, "kl": 0.0005980730056762695, "learning_rate": 2.0000000000000003e-06, "loss": -0.0162, "num_tokens": 2498989.0, "reward": 0.826758086681366, "reward_std": 0.1921631246805191, "rewards/accuracy_reward_step": 0.62109375, "rewards/asymmetric_l2_reward": 0.7040407657623291, "rewards/final_brier_reward_step": 0.632287859916687, "rewards/format_reward_step": 0.96484375, "step": 8 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.49755799755799757, "calib/avg_num_step_conf": 8.44921875, "calib/ece": 0.2472653061224489, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -4.8840048840537165e-05, "calib/mean_conf": 0.9901224489795918, "calib/mu_c": 0.9901098901098899, "calib/mu_w": 0.9901587301587305, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2472653061224489, "calib/std_conf": 0.0010997709049230609, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8988439716312056, "calib/step_q_c_n": 1410.0, "calib/step_q_gap": 0.0900657511796783, "calib/step_q_w": 0.8087782204515273, "calib/step_q_w_n": 753.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 3003.0, "completions/max_terminated_length": 3003.0, "completions/mean_length": 776.78515625, "completions/mean_terminated_length": 808.3617553710938, "completions/min_length": 0.0, "completions/min_terminated_length": 430.0, "epoch": 0.0096, "grad_norm": 0.007497426588088274, "kl": 0.0006467700004577637, "learning_rate": 2.25e-06, "loss": -0.0483, "num_tokens": 2805382.0, "reward": 0.883492112159729, "reward_std": 0.24604354798793793, "rewards/accuracy_reward_step": 0.7109375, "rewards/asymmetric_l2_reward": 0.7177037000656128, "rewards/final_brier_reward_step": 0.7156867384910583, "rewards/format_reward_step": 0.95703125, "step": 9 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.49134020618556706, "calib/avg_num_step_conf": 7.6875, "calib/ece": 0.3828906882591092, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00011408934707923812, "calib/mean_conf": 0.9901781376518217, "calib/mu_c": 0.9901333333333332, "calib/mu_w": 0.9902474226804124, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3828906882591092, "calib/std_conf": 0.0012696423407242182, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9113660245183888, "calib/step_q_c_n": 1142.0, "calib/step_q_gap": 0.005036726697565608, "calib/step_q_w": 0.9063292978208232, "calib/step_q_w_n": 826.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2945.0, "completions/max_terminated_length": 2945.0, "completions/mean_length": 844.6796875, "completions/mean_terminated_length": 861.5059814453125, "completions/min_length": 0.0, "completions/min_terminated_length": 475.0, "epoch": 0.010666666666666666, "grad_norm": 0.007133356295526028, "kl": 0.0006768107414245605, "learning_rate": 2.5e-06, "loss": 0.0149, "num_tokens": 3128420.0, "reward": 0.7908077239990234, "reward_std": 0.26634955406188965, "rewards/accuracy_reward_step": 0.5859375, "rewards/asymmetric_l2_reward": 0.678225576877594, "rewards/final_brier_reward_step": 0.5932334661483765, "rewards/format_reward_step": 0.96484375, "step": 10 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.49809211407149556, "calib/avg_num_step_conf": 7.453125, "calib/ece": 0.3765338645418326, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -3.815771857029748e-05, "calib/mean_conf": 0.9900796812749003, "calib/mu_c": 0.9900649350649349, "calib/mu_w": 0.9901030927835052, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3765338645418326, "calib/std_conf": 0.0008890802232837218, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9106165540540541, "calib/step_q_c_n": 1184.0, "calib/step_q_gap": 0.00452539383305961, "calib/step_q_w": 0.9060911602209945, "calib/step_q_w_n": 724.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2988.0, "completions/max_terminated_length": 2988.0, "completions/mean_length": 862.33203125, "completions/mean_terminated_length": 869.1220703125, "completions/min_length": 0.0, "completions/min_terminated_length": 424.0, "epoch": 0.011733333333333333, "grad_norm": 0.006749256979674101, "kl": 0.0007430911064147949, "learning_rate": 2.7500000000000004e-06, "loss": 0.0249, "num_tokens": 3453657.0, "reward": 0.817877471446991, "reward_std": 0.18740509450435638, "rewards/accuracy_reward_step": 0.6015625, "rewards/asymmetric_l2_reward": 0.7112424373626709, "rewards/final_brier_reward_step": 0.6088874340057373, "rewards/format_reward_step": 0.9765625, "step": 11 }, { "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.4921582605596151, "calib/avg_num_step_conf": 8.31640625, "calib/ece": 0.24534979423868308, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00015683478880823554, "calib/mean_conf": 0.9902057613168724, "calib/mu_c": 0.9901657458563534, "calib/mu_w": 0.9903225806451617, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24534979423868308, "calib/std_conf": 0.001419603976186038, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9127608554763449, "calib/step_q_c_n": 1543.0, "calib/step_q_gap": 0.012481845237436895, "calib/step_q_w": 0.900279010238908, "calib/step_q_w_n": 586.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2540.0, "completions/max_terminated_length": 2540.0, "completions/mean_length": 789.0234375, "completions/mean_terminated_length": 811.2047729492188, "completions/min_length": 0.0, "completions/min_terminated_length": 329.0, "epoch": 0.0128, "grad_norm": 0.007351956330239773, "kl": 0.0010945796966552734, "learning_rate": 3e-06, "loss": -0.0461, "num_tokens": 3759823.0, "reward": 0.9017115831375122, "reward_std": 0.22140005230903625, "rewards/accuracy_reward_step": 0.70703125, "rewards/asymmetric_l2_reward": 0.7605472803115845, "rewards/final_brier_reward_step": 0.7116257548332214, "rewards/format_reward_step": 0.94921875, "step": 12 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.49408326204733394, "calib/avg_num_step_conf": 8.19921875, "calib/ece": 0.324820717131474, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00011833475905331792, "calib/mean_conf": 0.9901593625498006, "calib/mu_c": 0.9901197604790417, "calib/mu_w": 0.990238095238095, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.324820717131474, "calib/std_conf": 0.0012522895335061138, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9110180995475113, "calib/step_q_c_n": 1326.0, "calib/step_q_gap": 0.0008887334414311443, "calib/step_q_w": 0.9101293661060802, "calib/step_q_w_n": 773.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1971.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 825.59375, "completions/mean_terminated_length": 838.698486328125, "completions/min_length": 0.0, "completions/min_terminated_length": 333.0, "epoch": 0.013866666666666666, "grad_norm": 0.008176865056157112, "kl": 0.010606169700622559, "learning_rate": 3.2500000000000002e-06, "loss": -0.0137, "num_tokens": 4075767.0, "reward": 0.8516587018966675, "reward_std": 0.20331385731697083, "rewards/accuracy_reward_step": 0.65234375, "rewards/asymmetric_l2_reward": 0.7181015014648438, "rewards/final_brier_reward_step": 0.6586534976959229, "rewards/format_reward_step": 0.98046875, "step": 13 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5079258841764125, "calib/avg_num_step_conf": 7.65234375, "calib/ece": 0.3738775510204082, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00015851768352848072, "calib/mean_conf": 0.990204081632653, "calib/mu_c": 0.9902649006622515, "calib/mu_w": 0.990106382978723, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3738775510204082, "calib/std_conf": 0.00141391902658684, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9120733788395907, "calib/step_q_c_n": 1172.0, "calib/step_q_gap": -0.00030273297743588223, "calib/step_q_w": 0.9123761118170266, "calib/step_q_w_n": 787.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2835.0, "completions/max_terminated_length": 2835.0, "completions/mean_length": 861.015625, "completions/mean_terminated_length": 888.790283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 460.0, "epoch": 0.014933333333333333, "grad_norm": 0.006741903256624937, "kl": 0.0025125741958618164, "learning_rate": 3.5e-06, "loss": -0.0444, "num_tokens": 4401587.0, "reward": 0.800338864326477, "reward_std": 0.23323848843574524, "rewards/accuracy_reward_step": 0.58984375, "rewards/asymmetric_l2_reward": 0.6942870616912842, "rewards/final_brier_reward_step": 0.5970156192779541, "rewards/format_reward_step": 0.95703125, "step": 14 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.48835025733759907, "calib/avg_num_step_conf": 8.3828125, "calib/ece": 0.3558634538152611, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0002329948532481252, "calib/mean_conf": 0.9904016064257029, "calib/mu_c": 0.9903164556962023, "calib/mu_w": 0.9905494505494504, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3558634538152611, "calib/std_conf": 0.001963358483788004, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9112529002320185, "calib/step_q_c_n": 1293.0, "calib/step_q_gap": -0.004925294375249911, "calib/step_q_w": 0.9161781946072685, "calib/step_q_w_n": 853.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2379.0, "completions/max_terminated_length": 2379.0, "completions/mean_length": 774.71484375, "completions/mean_terminated_length": 796.4939575195312, "completions/min_length": 0.0, "completions/min_terminated_length": 431.0, "epoch": 0.016, "grad_norm": 0.006958819925785065, "kl": 0.003179311752319336, "learning_rate": 3.7500000000000005e-06, "loss": -0.0364, "num_tokens": 4707794.0, "reward": 0.8193666934967041, "reward_std": 0.1843802034854889, "rewards/accuracy_reward_step": 0.6171875, "rewards/asymmetric_l2_reward": 0.6969516277313232, "rewards/final_brier_reward_step": 0.6238129138946533, "rewards/format_reward_step": 0.97265625, "step": 15 }, { "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.5129361324659965, "calib/avg_num_step_conf": 8.22265625, "calib/ece": 0.35505394190871375, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.991701244813278, "calib/gap": 0.012177927261975197, "calib/mean_conf": 0.9857593360995851, "calib/mu_c": 0.9902565789473684, "calib/mu_w": 0.9780786516853932, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35505394190871375, "calib/std_conf": 0.06391215578941556, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.912226787181594, "calib/step_q_c_n": 1217.0, "calib/step_q_gap": 0.011711021415828138, "calib/step_q_w": 0.9005157657657659, "calib/step_q_w_n": 888.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3068.0, "completions/max_terminated_length": 3068.0, "completions/mean_length": 934.87890625, "completions/mean_terminated_length": 968.943359375, "completions/min_length": 0.0, "completions/min_terminated_length": 511.0, "epoch": 0.017066666666666667, "grad_norm": 0.005922108888626099, "kl": 0.003998517990112305, "learning_rate": 4.000000000000001e-06, "loss": -0.0454, "num_tokens": 5055971.0, "reward": 0.7862738370895386, "reward_std": 0.22128784656524658, "rewards/accuracy_reward_step": 0.59375, "rewards/asymmetric_l2_reward": 0.6605606079101562, "rewards/final_brier_reward_step": 0.6049558520317078, "rewards/format_reward_step": 0.94140625, "step": 16 }, { "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.4801462904911181, "calib/avg_num_step_conf": 8.56640625, "calib/ece": 0.2651666666666668, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0003970741901777597, "calib/mean_conf": 0.9901666666666668, "calib/mu_c": 0.9900574712643677, "calib/mu_w": 0.9904545454545455, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2651666666666668, "calib/std_conf": 0.001280190957978102, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9115190735694824, "calib/step_q_c_n": 1468.0, "calib/step_q_gap": 0.009091487362585804, "calib/step_q_w": 0.9024275862068966, "calib/step_q_w_n": 725.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 3072.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 881.22265625, "completions/mean_terminated_length": 917.044677734375, "completions/min_length": 0.0, "completions/min_terminated_length": 442.0, "epoch": 0.018133333333333335, "grad_norm": 0.006506393197923899, "kl": 0.006697654724121094, "learning_rate": 4.25e-06, "loss": -0.0019, "num_tokens": 5385092.0, "reward": 0.8756923675537109, "reward_std": 0.2749893069267273, "rewards/accuracy_reward_step": 0.6796875, "rewards/asymmetric_l2_reward": 0.743429958820343, "rewards/final_brier_reward_step": 0.6845171451568604, "rewards/format_reward_step": 0.9375, "step": 17 }, { "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.49460734748904617, "calib/avg_num_step_conf": 8.52734375, "calib/ece": 0.4615122950819672, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00011358274351180864, "calib/mean_conf": 0.9902008196721311, "calib/mu_c": 0.9901472868217055, "calib/mu_w": 0.9902608695652173, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4615122950819672, "calib/std_conf": 0.0013895981198515555, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9050502164502165, "calib/step_q_c_n": 1155.0, "calib/step_q_gap": -0.00048480300503650486, "calib/step_q_w": 0.905535019455253, "calib/step_q_w_n": 1028.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2863.0, "completions/max_terminated_length": 2863.0, "completions/mean_length": 878.26953125, "completions/mean_terminated_length": 917.7020263671875, "completions/min_length": 0.0, "completions/min_terminated_length": 496.0, "epoch": 0.0192, "grad_norm": 0.006810254883021116, "kl": 0.009805679321289062, "learning_rate": 4.5e-06, "loss": -0.0829, "num_tokens": 5720649.0, "reward": 0.7003229856491089, "reward_std": 0.2627100646495819, "rewards/accuracy_reward_step": 0.50390625, "rewards/asymmetric_l2_reward": 0.5975357294082642, "rewards/final_brier_reward_step": 0.512485146522522, "rewards/format_reward_step": 0.94921875, "step": 18 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5020604395604397, "calib/avg_num_step_conf": 8.515625, "calib/ece": 0.4044422310756972, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 8.797749869160487e-05, "calib/mean_conf": 0.9900996015936255, "calib/mu_c": 0.9901360544217686, "calib/mu_w": 0.990048076923077, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4044422310756972, "calib/std_conf": 0.0009415380317208354, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9119737919737919, "calib/step_q_c_n": 1221.0, "calib/step_q_gap": 0.00832937070163342, "calib/step_q_w": 0.9036444212721585, "calib/step_q_w_n": 959.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2840.0, "completions/max_terminated_length": 2840.0, "completions/mean_length": 889.046875, "completions/mean_terminated_length": 906.7570190429688, "completions/min_length": 0.0, "completions/min_terminated_length": 488.0, "epoch": 0.020266666666666665, "grad_norm": 0.00690700626000762, "kl": 0.0130615234375, "learning_rate": 4.75e-06, "loss": -0.029, "num_tokens": 6053005.0, "reward": 0.7996513843536377, "reward_std": 0.1983795017004013, "rewards/accuracy_reward_step": 0.57421875, "rewards/asymmetric_l2_reward": 0.7061575651168823, "rewards/final_brier_reward_step": 0.5822077393531799, "rewards/format_reward_step": 0.98046875, "step": 19 }, { "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.5141586360266864, "calib/avg_num_step_conf": 8.92578125, "calib/ece": 0.3908691983122362, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.9957805907172996, "calib/gap": 0.0011315048183840881, "calib/mean_conf": 0.9900253164556961, "calib/mu_c": 0.9904788732394364, "calib/mu_w": 0.9893473684210523, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3908691983122362, "calib/std_conf": 0.006170519301011835, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9144237160120847, "calib/step_q_c_n": 1324.0, "calib/step_q_gap": 0.014782717052667427, "calib/step_q_w": 0.8996409989594173, "calib/step_q_w_n": 961.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2904.0, "completions/max_terminated_length": 2904.0, "completions/mean_length": 878.88671875, "completions/mean_terminated_length": 922.110595703125, "completions/min_length": 0.0, "completions/min_terminated_length": 476.0, "epoch": 0.021333333333333333, "grad_norm": 0.006423440296202898, "kl": 0.017368316650390625, "learning_rate": 5e-06, "loss": -0.0475, "num_tokens": 6382872.0, "reward": 0.7497608661651611, "reward_std": 0.25458869338035583, "rewards/accuracy_reward_step": 0.55859375, "rewards/asymmetric_l2_reward": 0.6401803493499756, "rewards/final_brier_reward_step": 0.5624663829803467, "rewards/format_reward_step": 0.92578125, "step": 20 }, { "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.5082817337461301, "calib/avg_num_step_conf": 9.05859375, "calib/ece": 0.34897046413502086, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0001698142414863213, "calib/mean_conf": 0.990320675105485, "calib/mu_c": 0.9903815789473684, "calib/mu_w": 0.9902117647058821, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34897046413502086, "calib/std_conf": 0.0017181442888759794, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9113709063214015, "calib/step_q_c_n": 1313.0, "calib/step_q_gap": -0.004332871014582684, "calib/step_q_w": 0.9157037773359842, "calib/step_q_w_n": 1006.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2425.0, "completions/max_terminated_length": 2425.0, "completions/mean_length": 862.90625, "completions/mean_terminated_length": 928.1680908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 486.0, "epoch": 0.0224, "grad_norm": 0.006447064224630594, "kl": 0.019985198974609375, "learning_rate": 4.9722222222222224e-06, "loss": -0.1212, "num_tokens": 6706736.0, "reward": 0.7912242412567139, "reward_std": 0.337002694606781, "rewards/accuracy_reward_step": 0.59375, "rewards/asymmetric_l2_reward": 0.6783818006515503, "rewards/final_brier_reward_step": 0.600160539150238, "rewards/format_reward_step": 0.92578125, "step": 21 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.4864224137931035, "calib/avg_num_step_conf": 9.453125, "calib/ece": 0.34248987854251, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.000250287356321377, "calib/mean_conf": 0.9902631578947367, "calib/mu_c": 0.990175, "calib/mu_w": 0.9904252873563214, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34248987854251, "calib/std_conf": 0.0015427704414423698, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9139300554016622, "calib/step_q_c_n": 1444.0, "calib/step_q_gap": -0.008908059352436215, "calib/step_q_w": 0.9228381147540984, "calib/step_q_w_n": 976.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2716.0, "completions/max_terminated_length": 2716.0, "completions/mean_length": 926.14453125, "completions/mean_terminated_length": 952.1806640625, "completions/min_length": 0.0, "completions/min_terminated_length": 554.0, "epoch": 0.023466666666666667, "grad_norm": 0.006909825373440981, "kl": 0.02556610107421875, "learning_rate": 4.944444444444445e-06, "loss": -0.0571, "num_tokens": 7045645.0, "reward": 0.8283920884132385, "reward_std": 0.2065667062997818, "rewards/accuracy_reward_step": 0.625, "rewards/asymmetric_l2_reward": 0.7074013948440552, "rewards/final_brier_reward_step": 0.631413996219635, "rewards/format_reward_step": 0.96484375, "step": 22 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.48486980999296275, "calib/avg_num_step_conf": 9.1875, "calib/ece": 0.3936090534979424, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0002897959183671972, "calib/mean_conf": 0.9903168724279835, "calib/mu_c": 0.9902000000000001, "calib/mu_w": 0.9904897959183673, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3936090534979424, "calib/std_conf": 0.0017196525908183033, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9122642244738894, "calib/step_q_c_n": 1283.0, "calib/step_q_gap": -0.00529237047466069, "calib/step_q_w": 0.9175565949485501, "calib/step_q_w_n": 1069.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3040.0, "completions/max_terminated_length": 3040.0, "completions/mean_length": 991.2578125, "completions/mean_terminated_length": 1023.2338256835938, "completions/min_length": 0.0, "completions/min_terminated_length": 576.0, "epoch": 0.024533333333333334, "grad_norm": 0.006668829824775457, "kl": 0.026376724243164062, "learning_rate": 4.9166666666666665e-06, "loss": -0.0238, "num_tokens": 7403343.0, "reward": 0.7600979208946228, "reward_std": 0.28747719526290894, "rewards/accuracy_reward_step": 0.5703125, "rewards/asymmetric_l2_reward": 0.6435527801513672, "rewards/final_brier_reward_step": 0.5735179781913757, "rewards/format_reward_step": 0.9453125, "step": 23 }, { "calib/answer_extract_rate": 0.875, "calib/auroc": 0.5044955044955045, "calib/avg_num_step_conf": 9.6875, "calib/ece": 0.3985466367713004, "calib/final_conf_rate": 0.87109375, "calib/format_rate": 0.8671875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 8.010323010343434e-05, "calib/mean_conf": 0.9904748878923767, "calib/mu_c": 0.9905075757575759, "calib/mu_w": 0.9904274725274724, "calib/nonempty_final_conf_rate": 0.87109375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3985466367713004, "calib/std_conf": 0.0020874592149258808, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9131759180187873, "calib/step_q_c_n": 1171.0, "calib/step_q_gap": 0.003358500142545906, "calib/step_q_w": 0.9098174178762414, "calib/step_q_w_n": 1309.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 3019.0, "completions/max_terminated_length": 3019.0, "completions/mean_length": 991.3203125, "completions/mean_terminated_length": 1084.5213623046875, "completions/min_length": 0.0, "completions/min_terminated_length": 456.0, "epoch": 0.0256, "grad_norm": 0.006332829128950834, "kl": 0.027338027954101562, "learning_rate": 4.888888888888889e-06, "loss": -0.0698, "num_tokens": 7761633.0, "reward": 0.7012392282485962, "reward_std": 0.3621658682823181, "rewards/accuracy_reward_step": 0.515625, "rewards/asymmetric_l2_reward": 0.6074740886688232, "rewards/final_brier_reward_step": 0.5184417963027954, "rewards/format_reward_step": 0.8671875, "step": 24 }, { "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.5003185981680606, "calib/avg_num_step_conf": 9.76171875, "calib/ece": 0.3333389830508474, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.1469534050179142e-05, "calib/mean_conf": 0.9901186440677966, "calib/mu_c": 0.9901225806451612, "calib/mu_w": 0.990111111111111, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3333389830508474, "calib/std_conf": 0.0010469446387952413, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9102945791726105, "calib/step_q_c_n": 1402.0, "calib/step_q_gap": -0.00886950469247616, "calib/step_q_w": 0.9191640838650866, "calib/step_q_w_n": 1097.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 3056.0, "completions/max_terminated_length": 3056.0, "completions/mean_length": 930.88671875, "completions/mean_terminated_length": 992.9458618164062, "completions/min_length": 0.0, "completions/min_terminated_length": 627.0, "epoch": 0.02666666666666667, "grad_norm": 0.00617326470091939, "kl": 0.03643798828125, "learning_rate": 4.861111111111111e-06, "loss": -0.0607, "num_tokens": 8103164.0, "reward": 0.7968409061431885, "reward_std": 0.26151537895202637, "rewards/accuracy_reward_step": 0.60546875, "rewards/asymmetric_l2_reward": 0.6765774488449097, "rewards/final_brier_reward_step": 0.6116355061531067, "rewards/format_reward_step": 0.921875, "step": 25 }, { "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.49639751552795025, "calib/avg_num_step_conf": 9.203125, "calib/ece": 0.30787711864406775, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -7.743271221494208e-05, "calib/mean_conf": 0.9900805084745762, "calib/mu_c": 0.990055900621118, "calib/mu_w": 0.990133333333333, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30787711864406775, "calib/std_conf": 0.0008720478989503826, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.910840395480226, "calib/step_q_c_n": 1416.0, "calib/step_q_gap": 0.001616991224906772, "calib/step_q_w": 0.9092234042553192, "calib/step_q_w_n": 940.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2894.0, "completions/max_terminated_length": 2894.0, "completions/mean_length": 961.359375, "completions/mean_terminated_length": 1025.4500732421875, "completions/min_length": 0.0, "completions/min_terminated_length": 586.0, "epoch": 0.027733333333333332, "grad_norm": 0.006155260372906923, "kl": 0.03830718994140625, "learning_rate": 4.833333333333333e-06, "loss": -0.0363, "num_tokens": 8454512.0, "reward": 0.8146965503692627, "reward_std": 0.18639928102493286, "rewards/accuracy_reward_step": 0.62890625, "rewards/asymmetric_l2_reward": 0.684640645980835, "rewards/final_brier_reward_step": 0.6345961093902588, "rewards/format_reward_step": 0.921875, "step": 26 }, { "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.48558532323820613, "calib/avg_num_step_conf": 9.10546875, "calib/ece": 0.4161898734177215, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00041438555620243456, "calib/mean_conf": 0.9900295358649789, "calib/mu_c": 0.9898529411764706, "calib/mu_w": 0.990267326732673, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4161898734177215, "calib/std_conf": 0.0024673336271401705, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9105679012345679, "calib/step_q_c_n": 1215.0, "calib/step_q_gap": 0.005031162883313289, "calib/step_q_w": 0.9055367383512546, "calib/step_q_w_n": 1116.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2504.0, "completions/max_terminated_length": 2504.0, "completions/mean_length": 957.65234375, "completions/mean_terminated_length": 1025.7698974609375, "completions/min_length": 0.0, "completions/min_terminated_length": 581.0, "epoch": 0.0288, "grad_norm": 0.006110661197453737, "kl": 0.03726959228515625, "learning_rate": 4.805555555555556e-06, "loss": -0.0576, "num_tokens": 8804887.0, "reward": 0.7280486822128296, "reward_std": 0.2960751950740814, "rewards/accuracy_reward_step": 0.53125, "rewards/asymmetric_l2_reward": 0.6258590817451477, "rewards/final_brier_reward_step": 0.5388320088386536, "rewards/format_reward_step": 0.92578125, "step": 27 }, { "calib/answer_extract_rate": 0.890625, "calib/auroc": 0.5021139705882354, "calib/avg_num_step_conf": 9.41796875, "calib/ece": 0.28837280701754386, "calib/final_conf_rate": 0.890625, "calib/format_rate": 0.890625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 7.647058823556652e-05, "calib/mean_conf": 0.9901271929824561, "calib/mu_c": 0.9901500000000001, "calib/mu_w": 0.9900735294117645, "calib/nonempty_final_conf_rate": 0.890625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28837280701754386, "calib/std_conf": 0.0009984887780795314, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9067275985663082, "calib/step_q_c_n": 1395.0, "calib/step_q_gap": -0.008336377811644535, "calib/step_q_w": 0.9150639763779528, "calib/step_q_w_n": 1016.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 2781.0, "completions/max_terminated_length": 2781.0, "completions/mean_length": 942.27734375, "completions/mean_terminated_length": 1035.2918701171875, "completions/min_length": 0.0, "completions/min_terminated_length": 611.0, "epoch": 0.029866666666666666, "grad_norm": 0.005902933422476053, "kl": 0.036403656005859375, "learning_rate": 4.777777777777778e-06, "loss": -0.1197, "num_tokens": 9153054.0, "reward": 0.8076508045196533, "reward_std": 0.25794512033462524, "rewards/accuracy_reward_step": 0.625, "rewards/asymmetric_l2_reward": 0.681990921497345, "rewards/final_brier_reward_step": 0.6301857233047485, "rewards/format_reward_step": 0.890625, "step": 28 }, { "calib/answer_extract_rate": 0.9140625, "calib/auroc": 0.4990469208211144, "calib/avg_num_step_conf": 8.796875, "calib/ece": 0.46025213675213683, "calib/final_conf_rate": 0.9140625, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1436950146603841e-05, "calib/mean_conf": 0.9901666666666668, "calib/mu_c": 0.9901612903225805, "calib/mu_w": 0.9901727272727271, "calib/nonempty_final_conf_rate": 0.9140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.46025213675213683, "calib/std_conf": 0.0012650799778778229, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9085794920037629, "calib/step_q_c_n": 1063.0, "calib/step_q_gap": -0.006252299417599749, "calib/step_q_w": 0.9148317914213626, "calib/step_q_w_n": 1189.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2987.0, "completions/max_terminated_length": 2987.0, "completions/mean_length": 1021.19921875, "completions/mean_terminated_length": 1098.432861328125, "completions/min_length": 0.0, "completions/min_terminated_length": 611.0, "epoch": 0.030933333333333334, "grad_norm": 0.005722293630242348, "kl": 0.037353515625, "learning_rate": 4.75e-06, "loss": -0.0728, "num_tokens": 9521609.0, "reward": 0.6883062124252319, "reward_std": 0.24335990846157074, "rewards/accuracy_reward_step": 0.484375, "rewards/asymmetric_l2_reward": 0.6041944026947021, "rewards/final_brier_reward_step": 0.4927304685115814, "rewards/format_reward_step": 0.9140625, "step": 29 }, { "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.5036021984551395, "calib/avg_num_step_conf": 9.14453125, "calib/ece": 0.41129361702127665, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 4.686571598344624e-05, "calib/mean_conf": 0.9900170212765957, "calib/mu_c": 0.9900367647058824, "calib/mu_w": 0.989989898989899, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.41129361702127665, "calib/std_conf": 0.0009361121838881834, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9080051369863015, "calib/step_q_c_n": 1168.0, "calib/step_q_gap": 0.004070780635065274, "calib/step_q_w": 0.9039343563512362, "calib/step_q_w_n": 1173.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2745.0, "completions/max_terminated_length": 2745.0, "completions/mean_length": 981.59765625, "completions/mean_terminated_length": 1060.2911376953125, "completions/min_length": 0.0, "completions/min_terminated_length": 636.0, "epoch": 0.032, "grad_norm": 0.005487215239554644, "kl": 0.039276123046875, "learning_rate": 4.722222222222222e-06, "loss": -0.1298, "num_tokens": 9879882.0, "reward": 0.7259608507156372, "reward_std": 0.3174560070037842, "rewards/accuracy_reward_step": 0.53515625, "rewards/asymmetric_l2_reward": 0.6232558488845825, "rewards/final_brier_reward_step": 0.5388221740722656, "rewards/format_reward_step": 0.9140625, "step": 30 }, { "calib/answer_extract_rate": 0.875, "calib/auroc": 0.4976919339164237, "calib/avg_num_step_conf": 8.4296875, "calib/ece": 0.4276160714285714, "calib/final_conf_rate": 0.875, "calib/format_rate": 0.875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0001383219954648185, "calib/mean_conf": 0.9901160714285714, "calib/mu_c": 0.9900555555555555, "calib/mu_w": 0.9901938775510203, "calib/nonempty_final_conf_rate": 0.875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4276160714285714, "calib/std_conf": 0.001914444789499249, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.906287184284378, "calib/step_q_c_n": 1069.0, "calib/step_q_gap": 0.0062476985176195265, "calib/step_q_w": 0.9000394857667585, "calib/step_q_w_n": 1089.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2950.0, "completions/max_terminated_length": 2950.0, "completions/mean_length": 1086.8125, "completions/mean_terminated_length": 1173.94091796875, "completions/min_length": 0.0, "completions/min_terminated_length": 640.0, "epoch": 0.03306666666666667, "grad_norm": 0.0050511229783296585, "kl": 0.032291412353515625, "learning_rate": 4.694444444444445e-06, "loss": 0.0117, "num_tokens": 10264018.0, "reward": 0.6780734658241272, "reward_std": 0.25499579310417175, "rewards/accuracy_reward_step": 0.4921875, "rewards/asymmetric_l2_reward": 0.5831027626991272, "rewards/final_brier_reward_step": 0.4996066093444824, "rewards/format_reward_step": 0.875, "step": 31 }, { "calib/answer_extract_rate": 0.90625, "calib/auroc": 0.4933823529411765, "calib/avg_num_step_conf": 8.265625, "calib/ece": 0.4012640692640692, "calib/final_conf_rate": 0.90234375, "calib/format_rate": 0.90234375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00013986068111493388, "calib/mean_conf": 0.990008658008658, "calib/mu_c": 0.9900661764705883, "calib/mu_w": 0.9899263157894733, "calib/nonempty_final_conf_rate": 0.90234375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4012640692640692, "calib/std_conf": 0.0024335165832523125, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9063417611159548, "calib/step_q_c_n": 1147.0, "calib/step_q_gap": 0.0021353627671416575, "calib/step_q_w": 0.9042063983488131, "calib/step_q_w_n": 969.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 3048.0, "completions/max_terminated_length": 3048.0, "completions/mean_length": 991.8203125, "completions/mean_terminated_length": 1053.5518798828125, "completions/min_length": 0.0, "completions/min_terminated_length": 593.0, "epoch": 0.034133333333333335, "grad_norm": 0.005195594392716885, "kl": 0.04204559326171875, "learning_rate": 4.666666666666667e-06, "loss": -0.051, "num_tokens": 10624628.0, "reward": 0.714741587638855, "reward_std": 0.2588036060333252, "rewards/accuracy_reward_step": 0.53125, "rewards/asymmetric_l2_reward": 0.6041333675384521, "rewards/final_brier_reward_step": 0.5386311411857605, "rewards/format_reward_step": 0.90234375, "step": 32 }, { "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.4734042553191489, "calib/avg_num_step_conf": 8.2734375, "calib/ece": 0.3802282157676349, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0004787234042552546, "calib/mean_conf": 0.9901867219917013, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.990478723404255, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3802282157676349, "calib/std_conf": 0.0012828222102560924, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.906288998357964, "calib/step_q_c_n": 1218.0, "calib/step_q_gap": 0.002572331691297225, "calib/step_q_w": 0.9037166666666667, "calib/step_q_w_n": 900.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2737.0, "completions/max_terminated_length": 2737.0, "completions/mean_length": 908.328125, "completions/mean_terminated_length": 956.9217529296875, "completions/min_length": 0.0, "completions/min_terminated_length": 527.0, "epoch": 0.0352, "grad_norm": 0.005166413728147745, "kl": 0.0454559326171875, "learning_rate": 4.638888888888889e-06, "loss": -0.0887, "num_tokens": 10964032.0, "reward": 0.7751585245132446, "reward_std": 0.2357899397611618, "rewards/accuracy_reward_step": 0.57421875, "rewards/asymmetric_l2_reward": 0.6716193556785583, "rewards/final_brier_reward_step": 0.5771350860595703, "rewards/format_reward_step": 0.93359375, "step": 33 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5082296266559614, "calib/avg_num_step_conf": 8.234375, "calib/ece": 0.41906437246963557, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0003461394353003744, "calib/mean_conf": 0.9899145748987853, "calib/mu_c": 0.9900631205673758, "calib/mu_w": 0.9897169811320754, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41906437246963557, "calib/std_conf": 0.0021831221656469823, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9054787322768975, "calib/step_q_c_n": 1199.0, "calib/step_q_gap": 0.0013093153352032871, "calib/step_q_w": 0.9041694169416942, "calib/step_q_w_n": 909.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2385.0, "completions/max_terminated_length": 2385.0, "completions/mean_length": 924.50390625, "completions/mean_terminated_length": 942.9203491210938, "completions/min_length": 0.0, "completions/min_terminated_length": 604.0, "epoch": 0.03626666666666667, "grad_norm": 0.006008944008499384, "kl": 0.046756744384765625, "learning_rate": 4.611111111111112e-06, "loss": -0.01, "num_tokens": 11305817.0, "reward": 0.7865276336669922, "reward_std": 0.3068256378173828, "rewards/accuracy_reward_step": 0.55078125, "rewards/asymmetric_l2_reward": 0.7107362151145935, "rewards/final_brier_reward_step": 0.5591940879821777, "rewards/format_reward_step": 0.96484375, "step": 34 }, { "calib/answer_extract_rate": 0.9140625, "calib/auroc": 0.5233333333333333, "calib/avg_num_step_conf": 7.85546875, "calib/ece": 0.4155617021276594, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0007481481481481644, "calib/mean_conf": 0.9900297872340424, "calib/mu_c": 0.990348148148148, "calib/mu_w": 0.9895999999999998, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4155617021276594, "calib/std_conf": 0.002948343559011518, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9087988980716254, "calib/step_q_c_n": 1089.0, "calib/step_q_gap": 0.001499548830844466, "calib/step_q_w": 0.9072993492407809, "calib/step_q_w_n": 922.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2616.0, "completions/max_terminated_length": 2616.0, "completions/mean_length": 962.83984375, "completions/mean_terminated_length": 1027.0291748046875, "completions/min_length": 0.0, "completions/min_terminated_length": 577.0, "epoch": 0.037333333333333336, "grad_norm": 0.004935343284159899, "kl": 0.04593658447265625, "learning_rate": 4.583333333333333e-06, "loss": -0.0778, "num_tokens": 11661560.0, "reward": 0.7069214582443237, "reward_std": 0.28172507882118225, "rewards/accuracy_reward_step": 0.52734375, "rewards/asymmetric_l2_reward": 0.5961191654205322, "rewards/final_brier_reward_step": 0.5310050249099731, "rewards/format_reward_step": 0.90625, "step": 35 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.47736757624398074, "calib/avg_num_step_conf": 8.796875, "calib/ece": 0.2724637096774194, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00043001605136394616, "calib/mean_conf": 0.9902056451612904, "calib/mu_c": 0.9900842696629213, "calib/mu_w": 0.9905142857142852, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2724637096774194, "calib/std_conf": 0.0016045702077219465, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9096727501573316, "calib/step_q_c_n": 1589.0, "calib/step_q_gap": -0.002705832044779921, "calib/step_q_w": 0.9123785822021115, "calib/step_q_w_n": 663.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3058.0, "completions/max_terminated_length": 3058.0, "completions/mean_length": 958.34375, "completions/mean_terminated_length": 977.4342651367188, "completions/min_length": 0.0, "completions/min_terminated_length": 551.0, "epoch": 0.0384, "grad_norm": 0.041168417781591415, "kl": 0.22402572631835938, "learning_rate": 4.555555555555556e-06, "loss": -0.0138, "num_tokens": 12009608.0, "reward": 0.9012855291366577, "reward_std": 0.21603024005889893, "rewards/accuracy_reward_step": 0.6953125, "rewards/asymmetric_l2_reward": 0.769353985786438, "rewards/final_brier_reward_step": 0.7004045248031616, "rewards/format_reward_step": 0.96875, "step": 36 }, { "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.5127486892192774, "calib/avg_num_step_conf": 7.6796875, "calib/ece": 0.4942669491525423, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0004327371974431893, "calib/mean_conf": 0.9900296610169491, "calib/mu_c": 0.9902478632478632, "calib/mu_w": 0.98981512605042, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4942669491525423, "calib/std_conf": 0.0024725547524240125, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9118057482656096, "calib/step_q_c_n": 1009.0, "calib/step_q_gap": 0.009423303124543647, "calib/step_q_w": 0.9023824451410659, "calib/step_q_w_n": 957.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2911.0, "completions/max_terminated_length": 2911.0, "completions/mean_length": 1007.19140625, "completions/mean_terminated_length": 1065.4586181640625, "completions/min_length": 0.0, "completions/min_terminated_length": 613.0, "epoch": 0.039466666666666664, "grad_norm": 0.0046908557415008545, "kl": 0.04862213134765625, "learning_rate": 4.527777777777778e-06, "loss": -0.0334, "num_tokens": 12374545.0, "reward": 0.6690409183502197, "reward_std": 0.21596181392669678, "rewards/accuracy_reward_step": 0.45703125, "rewards/asymmetric_l2_reward": 0.595897912979126, "rewards/final_brier_reward_step": 0.46640270948410034, "rewards/format_reward_step": 0.921875, "step": 37 }, { "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.4941360722610722, "calib/avg_num_step_conf": 7.88671875, "calib/ece": 0.431148305084746, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00010460372960396924, "calib/mean_conf": 0.990470338983051, "calib/mu_c": 0.9904242424242423, "calib/mu_w": 0.9905288461538463, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.431148305084746, "calib/std_conf": 0.002415339724921821, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9140769911504424, "calib/step_q_c_n": 1130.0, "calib/step_q_gap": 0.007677666066077893, "calib/step_q_w": 0.9063993250843645, "calib/step_q_w_n": 889.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2742.0, "completions/max_terminated_length": 2742.0, "completions/mean_length": 929.5625, "completions/mean_terminated_length": 995.6820068359375, "completions/min_length": 0.0, "completions/min_terminated_length": 570.0, "epoch": 0.04053333333333333, "grad_norm": 0.005542919039726257, "kl": 0.06835556030273438, "learning_rate": 4.5e-06, "loss": -0.0711, "num_tokens": 12719401.0, "reward": 0.7114863395690918, "reward_std": 0.25867563486099243, "rewards/accuracy_reward_step": 0.515625, "rewards/asymmetric_l2_reward": 0.613100528717041, "rewards/final_brier_reward_step": 0.5231534838676453, "rewards/format_reward_step": 0.91796875, "step": 38 }, { "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.4365874009178139, "calib/avg_num_step_conf": 8.25, "calib/ece": 0.41026666666666656, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0007673341677096879, "calib/mean_conf": 0.9905135802469135, "calib/mu_c": 0.9901914893617019, "calib/mu_w": 0.9909588235294116, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.41026666666666656, "calib/std_conf": 0.0040809328722435515, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9138510808646918, "calib/step_q_c_n": 1249.0, "calib/step_q_gap": 0.0021768050825363705, "calib/step_q_w": 0.9116742757821554, "calib/step_q_w_n": 863.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2830.0, "completions/max_terminated_length": 2830.0, "completions/mean_length": 998.80078125, "completions/mean_terminated_length": 1031.0201416015625, "completions/min_length": 0.0, "completions/min_terminated_length": 520.0, "epoch": 0.0416, "grad_norm": 0.005000025033950806, "kl": 0.056304931640625, "learning_rate": 4.472222222222223e-06, "loss": -0.0509, "num_tokens": 13081182.0, "reward": 0.7604011297225952, "reward_std": 0.26943159103393555, "rewards/accuracy_reward_step": 0.55078125, "rewards/asymmetric_l2_reward": 0.6646355390548706, "rewards/final_brier_reward_step": 0.5577292442321777, "rewards/format_reward_step": 0.94140625, "step": 39 }, { "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.5077826419289834, "calib/avg_num_step_conf": 8.2578125, "calib/ece": 0.4777125000000001, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0001811548884721459, "calib/mean_conf": 0.9902125, "calib/mu_c": 0.9903008130081301, "calib/mu_w": 0.990119658119658, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4777125000000001, "calib/std_conf": 0.001868781354252018, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9149052319842053, "calib/step_q_c_n": 1013.0, "calib/step_q_gap": 0.004578256507365985, "calib/step_q_w": 0.9103269754768393, "calib/step_q_w_n": 1101.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2863.0, "completions/max_terminated_length": 2863.0, "completions/mean_length": 975.87890625, "completions/mean_terminated_length": 1015.5487670898438, "completions/min_length": 0.0, "completions/min_terminated_length": 541.0, "epoch": 0.042666666666666665, "grad_norm": 0.004450581502169371, "kl": 0.0665130615234375, "learning_rate": 4.444444444444444e-06, "loss": -0.0089, "num_tokens": 13437767.0, "reward": 0.6698815822601318, "reward_std": 0.24097877740859985, "rewards/accuracy_reward_step": 0.48046875, "rewards/asymmetric_l2_reward": 0.5667624473571777, "rewards/final_brier_reward_step": 0.48940688371658325, "rewards/format_reward_step": 0.9375, "step": 40 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.515946783305996, "calib/avg_num_step_conf": 8.76171875, "calib/ece": 0.23104081632653062, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00031802442135930864, "calib/mean_conf": 0.9902244897959184, "calib/mu_c": 0.9903010752688172, "calib/mu_w": 0.9899830508474579, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.23104081632653062, "calib/std_conf": 0.0016814146876772244, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9165045262522632, "calib/step_q_c_n": 1657.0, "calib/step_q_gap": 0.00827756379492528, "calib/step_q_w": 0.9082269624573379, "calib/step_q_w_n": 586.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2973.0, "completions/max_terminated_length": 2973.0, "completions/mean_length": 935.36328125, "completions/mean_terminated_length": 953.9960327148438, "completions/min_length": 0.0, "completions/min_terminated_length": 535.0, "epoch": 0.04373333333333333, "grad_norm": 0.09011942893266678, "kl": 0.38977813720703125, "learning_rate": 4.416666666666667e-06, "loss": -0.0148, "num_tokens": 13784468.0, "reward": 0.9118338823318481, "reward_std": 0.25103092193603516, "rewards/accuracy_reward_step": 0.7265625, "rewards/asymmetric_l2_reward": 0.7605504989624023, "rewards/final_brier_reward_step": 0.7271796464920044, "rewards/format_reward_step": 0.953125, "step": 41 }, { "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.5044742729306487, "calib/avg_num_step_conf": 9.09375, "calib/ece": 0.36661087467081876, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.99581589958159, "calib/gap": 0.0012255885233013464, "calib/mean_conf": 0.990041837013915, "calib/mu_c": 0.9905033557046978, "calib/mu_w": 0.9892777671813965, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.36661087467081876, "calib/std_conf": 0.0065371684647353365, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9178921282798833, "calib/step_q_c_n": 1372.0, "calib/step_q_gap": 0.002306356216440375, "calib/step_q_w": 0.9155857720634429, "calib/step_q_w_n": 956.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2878.0, "completions/max_terminated_length": 2878.0, "completions/mean_length": 883.82421875, "completions/mean_terminated_length": 931.1069946289062, "completions/min_length": 0.0, "completions/min_terminated_length": 531.0, "epoch": 0.0448, "grad_norm": 0.004536244552582502, "kl": 0.08171844482421875, "learning_rate": 4.388888888888889e-06, "loss": -0.0463, "num_tokens": 14115095.0, "reward": 0.7774260640144348, "reward_std": 0.20005618035793304, "rewards/accuracy_reward_step": 0.58203125, "rewards/asymmetric_l2_reward": 0.6669763922691345, "rewards/final_brier_reward_step": 0.585532009601593, "rewards/format_reward_step": 0.9296875, "step": 42 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5237985567326885, "calib/avg_num_step_conf": 9.2734375, "calib/ece": 0.30875510204081646, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00041839398126841676, "calib/mean_conf": 0.9903877551020409, "calib/mu_c": 0.9905209580838322, "calib/mu_w": 0.9901025641025638, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.30875510204081646, "calib/std_conf": 0.001809457315539576, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9154681262073407, "calib/step_q_c_n": 1553.0, "calib/step_q_gap": 0.0038030835764029103, "calib/step_q_w": 0.9116650426309378, "calib/step_q_w_n": 821.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3019.0, "completions/max_terminated_length": 3019.0, "completions/mean_length": 960.828125, "completions/mean_terminated_length": 987.8392944335938, "completions/min_length": 0.0, "completions/min_terminated_length": 591.0, "epoch": 0.04586666666666667, "grad_norm": 0.0043022241443395615, "kl": 0.07738494873046875, "learning_rate": 4.361111111111112e-06, "loss": 0.0064, "num_tokens": 14466291.0, "reward": 0.8499053120613098, "reward_std": 0.2604464888572693, "rewards/accuracy_reward_step": 0.65234375, "rewards/asymmetric_l2_reward": 0.7243393063545227, "rewards/final_brier_reward_step": 0.6543775796890259, "rewards/format_reward_step": 0.953125, "step": 43 }, { "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.5004680973642518, "calib/avg_num_step_conf": 9.03515625, "calib/ece": 0.43760759493670887, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 5.329108454577014e-05, "calib/mean_conf": 0.9903502109704642, "calib/mu_c": 0.9903740458015268, "calib/mu_w": 0.990320754716981, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.43760759493670887, "calib/std_conf": 0.0017883029334125153, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.91791894127378, "calib/step_q_c_n": 1209.0, "calib/step_q_gap": 0.007094665911461129, "calib/step_q_w": 0.9108242753623189, "calib/step_q_w_n": 1104.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2998.0, "completions/max_terminated_length": 2998.0, "completions/mean_length": 933.0859375, "completions/mean_terminated_length": 978.975341796875, "completions/min_length": 0.0, "completions/min_terminated_length": 582.0, "epoch": 0.046933333333333334, "grad_norm": 0.004361076280474663, "kl": 0.08103179931640625, "learning_rate": 4.333333333333334e-06, "loss": -0.0364, "num_tokens": 14811481.0, "reward": 0.7002480030059814, "reward_std": 0.22973665595054626, "rewards/accuracy_reward_step": 0.51171875, "rewards/asymmetric_l2_reward": 0.598037838935852, "rewards/final_brier_reward_step": 0.5157393217086792, "rewards/format_reward_step": 0.921875, "step": 44 }, { "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.5265525565064736, "calib/avg_num_step_conf": 10.54296875, "calib/ece": 0.3753968055555554, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.9958333333333333, "calib/gap": 0.0076378245922025245, "calib/mean_conf": 0.9878968055555556, "calib/mu_c": 0.9908564625850338, "calib/mu_w": 0.9832186379928313, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.3753968055555554, "calib/std_conf": 0.042494877076636975, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.918011901504788, "calib/step_q_c_n": 1462.0, "calib/step_q_gap": 0.007727342086841005, "calib/step_q_w": 0.910284559417947, "calib/step_q_w_n": 1237.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2732.0, "completions/max_terminated_length": 2732.0, "completions/mean_length": 941.203125, "completions/mean_terminated_length": 987.4917602539062, "completions/min_length": 0.0, "completions/min_terminated_length": 534.0, "epoch": 0.048, "grad_norm": 0.004294428043067455, "kl": 0.08637237548828125, "learning_rate": 4.305555555555556e-06, "loss": -0.0266, "num_tokens": 15157477.0, "reward": 0.7546592354774475, "reward_std": 0.2604624629020691, "rewards/accuracy_reward_step": 0.57421875, "rewards/asymmetric_l2_reward": 0.6334857940673828, "rewards/final_brier_reward_step": 0.5766139626502991, "rewards/format_reward_step": 0.921875, "step": 45 }, { "calib/answer_extract_rate": 0.9140625, "calib/auroc": 0.48762157382847043, "calib/avg_num_step_conf": 10.15625, "calib/ece": 0.49259656652360534, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00024506336575280674, "calib/mean_conf": 0.9904506437768242, "calib/mu_c": 0.9903275862068965, "calib/mu_w": 0.9905726495726493, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.49259656652360534, "calib/std_conf": 0.0022290343214388304, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9128881685575364, "calib/step_q_c_n": 1234.0, "calib/step_q_gap": -0.0026799134336789043, "calib/step_q_w": 0.9155680819912153, "calib/step_q_w_n": 1366.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2859.0, "completions/max_terminated_length": 2859.0, "completions/mean_length": 952.4609375, "completions/mean_terminated_length": 1011.7427978515625, "completions/min_length": 0.0, "completions/min_terminated_length": 524.0, "epoch": 0.04906666666666667, "grad_norm": 0.003909003920853138, "kl": 0.08541107177734375, "learning_rate": 4.277777777777778e-06, "loss": -0.0688, "num_tokens": 15506075.0, "reward": 0.6511839628219604, "reward_std": 0.23548802733421326, "rewards/accuracy_reward_step": 0.453125, "rewards/asymmetric_l2_reward": 0.5727441310882568, "rewards/final_brier_reward_step": 0.45774880051612854, "rewards/format_reward_step": 0.90625, "step": 46 }, { "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.47734067663257274, "calib/avg_num_step_conf": 9.25, "calib/ece": 0.3367299578059072, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00041864673485447756, "calib/mean_conf": 0.9907383966244726, "calib/mu_c": 0.9905935483870968, "calib/mu_w": 0.9910121951219513, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3367299578059072, "calib/std_conf": 0.0028222896641156934, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9156944444444445, "calib/step_q_c_n": 1512.0, "calib/step_q_gap": 0.007248182762201516, "calib/step_q_w": 0.908446261682243, "calib/step_q_w_n": 856.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 3053.0, "completions/max_terminated_length": 3053.0, "completions/mean_length": 952.6484375, "completions/mean_terminated_length": 1003.6131591796875, "completions/min_length": 0.0, "completions/min_terminated_length": 608.0, "epoch": 0.050133333333333335, "grad_norm": 0.004438928794115782, "kl": 0.09577178955078125, "learning_rate": 4.25e-06, "loss": -0.0408, "num_tokens": 15855929.0, "reward": 0.7888574600219727, "reward_std": 0.22867697477340698, "rewards/accuracy_reward_step": 0.60546875, "rewards/asymmetric_l2_reward": 0.6705582737922668, "rewards/final_brier_reward_step": 0.6032503247261047, "rewards/format_reward_step": 0.9140625, "step": 47 }, { "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.4854422903203391, "calib/avg_num_step_conf": 10.80859375, "calib/ece": 0.4783245833333335, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00014048363560548172, "calib/mean_conf": 0.9908245833333335, "calib/mu_c": 0.9907560975609756, "calib/mu_w": 0.9908965811965811, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4783245833333335, "calib/std_conf": 0.003715297743975429, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9193959881129272, "calib/step_q_c_n": 1346.0, "calib/step_q_gap": 0.011938352644946915, "calib/step_q_w": 0.9074576354679803, "calib/step_q_w_n": 1421.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2835.0, "completions/max_terminated_length": 2835.0, "completions/mean_length": 943.671875, "completions/mean_terminated_length": 986.040771484375, "completions/min_length": 0.0, "completions/min_terminated_length": 606.0, "epoch": 0.0512, "grad_norm": 0.004192838445305824, "kl": 0.10528564453125, "learning_rate": 4.222222222222223e-06, "loss": -0.0329, "num_tokens": 16201197.0, "reward": 0.6844377517700195, "reward_std": 0.32408520579338074, "rewards/accuracy_reward_step": 0.48046875, "rewards/asymmetric_l2_reward": 0.601270854473114, "rewards/final_brier_reward_step": 0.4847921133041382, "rewards/format_reward_step": 0.93359375, "step": 48 }, { "calib/answer_extract_rate": 0.9140625, "calib/auroc": 0.4895036678632746, "calib/avg_num_step_conf": 11.13671875, "calib/ece": 0.35335035460992914, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.9872340425531915, "calib/gap": 0.0024480255970035225, "calib/mean_conf": 0.9845560283687944, "calib/mu_c": 0.9854519015659954, "calib/mu_w": 0.9830038759689919, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3519319148936171, "calib/std_conf": 0.06066803876025451, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9165114773396116, "calib/step_q_c_n": 1699.0, "calib/step_q_gap": 0.01334047039516717, "calib/step_q_w": 0.9031710069444444, "calib/step_q_w_n": 1152.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2891.0, "completions/max_terminated_length": 2891.0, "completions/mean_length": 890.3984375, "completions/mean_terminated_length": 949.7583618164062, "completions/min_length": 0.0, "completions/min_terminated_length": 527.0, "epoch": 0.05226666666666667, "grad_norm": 0.0043127527460455894, "kl": 0.100250244140625, "learning_rate": 4.194444444444445e-06, "loss": -0.0883, "num_tokens": 16533675.0, "reward": 0.7699627876281738, "reward_std": 0.2641654908657074, "rewards/accuracy_reward_step": 0.58203125, "rewards/asymmetric_l2_reward": 0.6509315967559814, "rewards/final_brier_reward_step": 0.5897751450538635, "rewards/format_reward_step": 0.9140625, "step": 49 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5242763772175536, "calib/avg_num_step_conf": 10.9375, "calib/ece": 0.3634918032786886, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0007050204697266471, "calib/mean_conf": 0.9905409836065574, "calib/mu_c": 0.9908039215686276, "calib/mu_w": 0.990098901098901, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3634918032786886, "calib/std_conf": 0.003249583935495379, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9180515151515152, "calib/step_q_c_n": 1650.0, "calib/step_q_gap": 0.004113254281949974, "calib/step_q_w": 0.9139382608695652, "calib/step_q_w_n": 1150.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2566.0, "completions/max_terminated_length": 2566.0, "completions/mean_length": 905.09765625, "completions/mean_terminated_length": 926.820068359375, "completions/min_length": 0.0, "completions/min_terminated_length": 86.0, "epoch": 0.05333333333333334, "grad_norm": 0.003878743853420019, "kl": 0.1117095947265625, "learning_rate": 4.166666666666667e-06, "loss": -0.0217, "num_tokens": 16870740.0, "reward": 0.7831500172615051, "reward_std": 0.28436478972435, "rewards/accuracy_reward_step": 0.59765625, "rewards/asymmetric_l2_reward": 0.6562309265136719, "rewards/final_brier_reward_step": 0.6006940603256226, "rewards/format_reward_step": 0.94921875, "step": 50 }, { "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.500757460990759, "calib/avg_num_step_conf": 10.390625, "calib/ece": 0.3277983539094652, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00015308286623205714, "calib/mean_conf": 0.9903497942386833, "calib/mu_c": 0.9902981366459629, "calib/mu_w": 0.990451219512195, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3277983539094652, "calib/std_conf": 0.00372692764480295, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9127832009080591, "calib/step_q_c_n": 1762.0, "calib/step_q_gap": -0.0003715875106490296, "calib/step_q_w": 0.9131547884187081, "calib/step_q_w_n": 898.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2950.0, "completions/max_terminated_length": 2950.0, "completions/mean_length": 875.6015625, "completions/mean_terminated_length": 911.195068359375, "completions/min_length": 0.0, "completions/min_terminated_length": 550.0, "epoch": 0.0544, "grad_norm": 0.004427890758961439, "kl": 0.106109619140625, "learning_rate": 4.138888888888889e-06, "loss": -0.0402, "num_tokens": 17204190.0, "reward": 0.8117591142654419, "reward_std": 0.2127530574798584, "rewards/accuracy_reward_step": 0.62890625, "rewards/asymmetric_l2_reward": 0.6729713678359985, "rewards/final_brier_reward_step": 0.6349219083786011, "rewards/format_reward_step": 0.94921875, "step": 51 }, { "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.48323471400394485, "calib/avg_num_step_conf": 10.9140625, "calib/ece": 0.3018319672131149, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9959016393442623, "calib/gap": -0.006114635108481092, "calib/mean_conf": 0.9862581967213114, "calib/mu_c": 0.9843786982248521, "calib/mu_w": 0.9904933333333332, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2977336065573772, "calib/std_conf": 0.06331118609908344, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9123175487465182, "calib/step_q_c_n": 1795.0, "calib/step_q_gap": -3.883797136672307e-06, "calib/step_q_w": 0.9123214325436548, "calib/step_q_w_n": 999.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2795.0, "completions/max_terminated_length": 2795.0, "completions/mean_length": 898.99609375, "completions/mean_terminated_length": 927.9959106445312, "completions/min_length": 0.0, "completions/min_terminated_length": 545.0, "epoch": 0.055466666666666664, "grad_norm": 0.004448126070201397, "kl": 0.111572265625, "learning_rate": 4.111111111111111e-06, "loss": 0.0072, "num_tokens": 17542285.0, "reward": 0.8596779108047485, "reward_std": 0.25896918773651123, "rewards/accuracy_reward_step": 0.66015625, "rewards/asymmetric_l2_reward": 0.7349734306335449, "rewards/final_brier_reward_step": 0.661726176738739, "rewards/format_reward_step": 0.953125, "step": 52 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5007157464212679, "calib/avg_num_step_conf": 9.921875, "calib/ece": 0.34688142292490115, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9960474308300395, "calib/gap": -0.0008140422631220945, "calib/mean_conf": 0.9899644268774704, "calib/mu_c": 0.9896748466257669, "calib/mu_w": 0.990488888888889, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3462885375494071, "calib/std_conf": 0.009443127003087047, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9152066167290887, "calib/step_q_c_n": 1602.0, "calib/step_q_gap": 0.002208748925250692, "calib/step_q_w": 0.912997867803838, "calib/step_q_w_n": 938.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1759.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 919.34375, "completions/mean_terminated_length": 930.2451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 493.0, "epoch": 0.05653333333333333, "grad_norm": 0.004339267965406179, "kl": 0.11077880859375, "learning_rate": 4.083333333333334e-06, "loss": -0.0096, "num_tokens": 17883461.0, "reward": 0.8446915149688721, "reward_std": 0.24097609519958496, "rewards/accuracy_reward_step": 0.63671875, "rewards/asymmetric_l2_reward": 0.7220233082771301, "rewards/final_brier_reward_step": 0.6431408524513245, "rewards/format_reward_step": 0.984375, "step": 53 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5237379332172811, "calib/avg_num_step_conf": 11.91796875, "calib/ece": 0.27575502008032127, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00048520335496160527, "calib/mean_conf": 0.9906144578313253, "calib/mu_c": 0.9907528089887639, "calib/mu_w": 0.9902676056338023, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.27575502008032127, "calib/std_conf": 0.0026684140498420606, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9163947105788423, "calib/step_q_c_n": 2004.0, "calib/step_q_gap": 0.006814958907400026, "calib/step_q_w": 0.9095797516714422, "calib/step_q_w_n": 1047.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1784.0, "completions/max_terminated_length": 1784.0, "completions/mean_length": 842.21875, "completions/mean_terminated_length": 862.4320678710938, "completions/min_length": 0.0, "completions/min_terminated_length": 537.0, "epoch": 0.0576, "grad_norm": 0.004163231234997511, "kl": 0.1133880615234375, "learning_rate": 4.055555555555556e-06, "loss": -0.0272, "num_tokens": 18205301.0, "reward": 0.8736650943756104, "reward_std": 0.26025390625, "rewards/accuracy_reward_step": 0.6953125, "rewards/asymmetric_l2_reward": 0.7224922180175781, "rewards/final_brier_reward_step": 0.692806601524353, "rewards/format_reward_step": 0.96484375, "step": 54 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.49713242961418136, "calib/avg_num_step_conf": 10.42578125, "calib/ece": 0.44043333333333345, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00014805135557871552, "calib/mean_conf": 0.9906341365461848, "calib/mu_c": 0.9907007299270073, "calib/mu_w": 0.9905526785714286, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.44043333333333345, "calib/std_conf": 0.0032175970974869994, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9174707412223667, "calib/step_q_c_n": 1538.0, "calib/step_q_gap": 0.005732191266575315, "calib/step_q_w": 0.9117385499557914, "calib/step_q_w_n": 1131.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2272.0, "completions/max_terminated_length": 2272.0, "completions/mean_length": 862.86328125, "completions/mean_terminated_length": 883.572021484375, "completions/min_length": 0.0, "completions/min_terminated_length": 498.0, "epoch": 0.058666666666666666, "grad_norm": 0.004341635387390852, "kl": 0.1074371337890625, "learning_rate": 4.027777777777779e-06, "loss": -0.0263, "num_tokens": 18534018.0, "reward": 0.7443338632583618, "reward_std": 0.19267994165420532, "rewards/accuracy_reward_step": 0.53515625, "rewards/asymmetric_l2_reward": 0.6437778472900391, "rewards/final_brier_reward_step": 0.5433272123336792, "rewards/format_reward_step": 0.97265625, "step": 55 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5393295718552937, "calib/avg_num_step_conf": 10.25390625, "calib/ece": 0.4578983739837398, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0009985396614667907, "calib/mean_conf": 0.9904186991869918, "calib/mu_c": 0.990885496183206, "calib/mu_w": 0.9898869565217392, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4578983739837398, "calib/std_conf": 0.0035635199916164025, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9186488388458831, "calib/step_q_c_n": 1421.0, "calib/step_q_gap": 0.005492692666481114, "calib/step_q_w": 0.913156146179402, "calib/step_q_w_n": 1204.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2633.0, "completions/max_terminated_length": 2633.0, "completions/mean_length": 868.015625, "completions/mean_terminated_length": 896.01611328125, "completions/min_length": 0.0, "completions/min_terminated_length": 563.0, "epoch": 0.05973333333333333, "grad_norm": 0.004599003586918116, "kl": 0.11109161376953125, "learning_rate": 4.000000000000001e-06, "loss": -0.0241, "num_tokens": 18863070.0, "reward": 0.7282382249832153, "reward_std": 0.27239811420440674, "rewards/accuracy_reward_step": 0.51171875, "rewards/asymmetric_l2_reward": 0.6412410140037537, "rewards/final_brier_reward_step": 0.5207042694091797, "rewards/format_reward_step": 0.9609375, "step": 56 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.48916912822843295, "calib/avg_num_step_conf": 10.65234375, "calib/ece": 0.3227213114754097, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.9959016393442623, "calib/gap": -0.0008485950162842038, "calib/mean_conf": 0.9907540983606556, "calib/mu_c": 0.9904723926380369, "calib/mu_w": 0.9913209876543211, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3227213114754097, "calib/std_conf": 0.00696809059083824, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9192886836027714, "calib/step_q_c_n": 1732.0, "calib/step_q_gap": 0.012469588125384345, "calib/step_q_w": 0.906819095477387, "calib/step_q_w_n": 995.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3027.0, "completions/max_terminated_length": 3027.0, "completions/mean_length": 854.36328125, "completions/mean_terminated_length": 881.92333984375, "completions/min_length": 0.0, "completions/min_terminated_length": 564.0, "epoch": 0.0608, "grad_norm": 0.0046896119602024555, "kl": 0.10809326171875, "learning_rate": 3.972222222222223e-06, "loss": 0.0136, "num_tokens": 19188579.0, "reward": 0.8231363296508789, "reward_std": 0.21880272030830383, "rewards/accuracy_reward_step": 0.63671875, "rewards/asymmetric_l2_reward": 0.6887975931167603, "rewards/final_brier_reward_step": 0.641849935054779, "rewards/format_reward_step": 0.94140625, "step": 57 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5048640724946695, "calib/avg_num_step_conf": 10.06640625, "calib/ece": 0.4460243902439024, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -5.143923240968906e-05, "calib/mean_conf": 0.9907398373983739, "calib/mu_c": 0.9907164179104475, "calib/mu_w": 0.9907678571428572, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4460243902439024, "calib/std_conf": 0.0034999126548311047, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9133302611367129, "calib/step_q_c_n": 1302.0, "calib/step_q_gap": 0.000403986626908992, "calib/step_q_w": 0.9129262745098039, "calib/step_q_w_n": 1275.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2097.0, "completions/max_terminated_length": 2097.0, "completions/mean_length": 907.40625, "completions/mean_terminated_length": 936.6773681640625, "completions/min_length": 0.0, "completions/min_terminated_length": 497.0, "epoch": 0.06186666666666667, "grad_norm": 0.004239367786794901, "kl": 0.0998992919921875, "learning_rate": 3.944444444444445e-06, "loss": -0.0459, "num_tokens": 19527195.0, "reward": 0.7133985757827759, "reward_std": 0.27472320199012756, "rewards/accuracy_reward_step": 0.5234375, "rewards/asymmetric_l2_reward": 0.6040467619895935, "rewards/final_brier_reward_step": 0.5274378061294556, "rewards/format_reward_step": 0.953125, "step": 58 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.48307274841710807, "calib/avg_num_step_conf": 9.89453125, "calib/ece": 0.4240278884462152, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9920318725099602, "calib/gap": 0.0011575138906835258, "calib/mean_conf": 0.9897649402390439, "calib/mu_c": 0.9902676056338026, "calib/mu_w": 0.989110091743119, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4240278884462152, "calib/std_conf": 0.009348313326382543, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9131354883081156, "calib/step_q_c_n": 1454.0, "calib/step_q_gap": -0.00026858954174535654, "calib/step_q_w": 0.913404077849861, "calib/step_q_w_n": 1079.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2588.0, "completions/max_terminated_length": 2588.0, "completions/mean_length": 857.40234375, "completions/mean_terminated_length": 871.011962890625, "completions/min_length": 0.0, "completions/min_terminated_length": 490.0, "epoch": 0.06293333333333333, "grad_norm": 0.004450817126780748, "kl": 0.1063232421875, "learning_rate": 3.916666666666667e-06, "loss": -0.0098, "num_tokens": 19852938.0, "reward": 0.7629809379577637, "reward_std": 0.28226304054260254, "rewards/accuracy_reward_step": 0.5546875, "rewards/asymmetric_l2_reward": 0.6598451733589172, "rewards/final_brier_reward_step": 0.5598666667938232, "rewards/format_reward_step": 0.9765625, "step": 59 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.4952478384265065, "calib/avg_num_step_conf": 9.59765625, "calib/ece": 0.43016895161290347, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00022657910368950596, "calib/mean_conf": 0.9906528225806454, "calib/mu_c": 0.9905532374100718, "calib/mu_w": 0.9907798165137613, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.43016895161290347, "calib/std_conf": 0.0036146643668052265, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9191200987306065, "calib/step_q_c_n": 1418.0, "calib/step_q_gap": 0.007535883138691246, "calib/step_q_w": 0.9115842155919153, "calib/step_q_w_n": 1039.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2479.0, "completions/max_terminated_length": 2479.0, "completions/mean_length": 830.61328125, "completions/mean_terminated_length": 853.9638061523438, "completions/min_length": 0.0, "completions/min_terminated_length": 525.0, "epoch": 0.064, "grad_norm": 0.004765588324517012, "kl": 0.1033172607421875, "learning_rate": 3.88888888888889e-06, "loss": -0.0186, "num_tokens": 20174431.0, "reward": 0.7508342862129211, "reward_std": 0.2844465672969818, "rewards/accuracy_reward_step": 0.54296875, "rewards/asymmetric_l2_reward": 0.648601770401001, "rewards/final_brier_reward_step": 0.5507229566574097, "rewards/format_reward_step": 0.96875, "step": 60 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5230346576500422, "calib/avg_num_step_conf": 10.578125, "calib/ece": 0.3233754940711463, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0004493519301215221, "calib/mean_conf": 0.9913596837944665, "calib/mu_c": 0.9915088757396451, "calib/mu_w": 0.9910595238095236, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3233754940711463, "calib/std_conf": 0.0035107225784080016, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9175910396323952, "calib/step_q_c_n": 1741.0, "calib/step_q_gap": 0.007756499818537987, "calib/step_q_w": 0.9098345398138572, "calib/step_q_w_n": 967.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1689.0, "completions/max_terminated_length": 1689.0, "completions/mean_length": 803.07421875, "completions/mean_terminated_length": 809.3976440429688, "completions/min_length": 0.0, "completions/min_terminated_length": 460.0, "epoch": 0.06506666666666666, "grad_norm": 0.004396671429276466, "kl": 0.1050567626953125, "learning_rate": 3.861111111111112e-06, "loss": 0.0029, "num_tokens": 20484082.0, "reward": 0.8529362082481384, "reward_std": 0.2431645393371582, "rewards/accuracy_reward_step": 0.66015625, "rewards/asymmetric_l2_reward": 0.7111063003540039, "rewards/final_brier_reward_step": 0.6658596992492676, "rewards/format_reward_step": 0.984375, "step": 61 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.45560005085176714, "calib/avg_num_step_conf": 10.02734375, "calib/ece": 0.4439281746031747, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0008107932875668356, "calib/mean_conf": 0.9915472222222224, "calib/mu_c": 0.9911804347826085, "calib/mu_w": 0.9919912280701754, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4439281746031747, "calib/std_conf": 0.0035269641918985748, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9174202823179792, "calib/step_q_c_n": 1346.0, "calib/step_q_gap": 2.7161924858742914e-05, "calib/step_q_w": 0.9173931203931205, "calib/step_q_w_n": 1221.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2473.0, "completions/max_terminated_length": 2473.0, "completions/mean_length": 835.9921875, "completions/mean_terminated_length": 845.9051513671875, "completions/min_length": 0.0, "completions/min_terminated_length": 485.0, "epoch": 0.06613333333333334, "grad_norm": 0.004839341156184673, "kl": 0.10821533203125, "learning_rate": 3.833333333333334e-06, "loss": 0.0001, "num_tokens": 20805176.0, "reward": 0.7392706871032715, "reward_std": 0.31165140867233276, "rewards/accuracy_reward_step": 0.5390625, "rewards/asymmetric_l2_reward": 0.6277410984039307, "rewards/final_brier_reward_step": 0.5461127161979675, "rewards/format_reward_step": 0.984375, "step": 62 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5094806802632437, "calib/avg_num_step_conf": 8.9765625, "calib/ece": 0.399904761904762, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00022962142438298638, "calib/mean_conf": 0.9911746031746033, "calib/mu_c": 0.9912684563758388, "calib/mu_w": 0.9910388349514558, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.399904761904762, "calib/std_conf": 0.003223375350785793, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9177375000000001, "calib/step_q_c_n": 1440.0, "calib/step_q_gap": 0.010147756410256559, "calib/step_q_w": 0.9075897435897435, "calib/step_q_w_n": 858.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2837.0, "completions/max_terminated_length": 2837.0, "completions/mean_length": 866.95703125, "completions/mean_terminated_length": 873.783447265625, "completions/min_length": 0.0, "completions/min_terminated_length": 442.0, "epoch": 0.0672, "grad_norm": 0.004709714557975531, "kl": 0.096405029296875, "learning_rate": 3.8055555555555556e-06, "loss": 0.0354, "num_tokens": 21135757.0, "reward": 0.784890353679657, "reward_std": 0.2856467366218567, "rewards/accuracy_reward_step": 0.58203125, "rewards/asymmetric_l2_reward": 0.6720311641693115, "rewards/final_brier_reward_step": 0.5852494239807129, "rewards/format_reward_step": 0.98046875, "step": 63 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5216008771929825, "calib/avg_num_step_conf": 9.19140625, "calib/ece": 0.309864143426295, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00047931286549685037, "calib/mean_conf": 0.9911390438247013, "calib/mu_c": 0.9912918128654971, "calib/mu_w": 0.9908125000000002, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.309864143426295, "calib/std_conf": 0.0030717562948793975, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9159911111111112, "calib/step_q_c_n": 1575.0, "calib/step_q_gap": 0.00026231933733222856, "calib/step_q_w": 0.915728791773779, "calib/step_q_w_n": 778.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2331.0, "completions/max_terminated_length": 2331.0, "completions/mean_length": 829.42578125, "completions/mean_terminated_length": 845.9482421875, "completions/min_length": 0.0, "completions/min_terminated_length": 493.0, "epoch": 0.06826666666666667, "grad_norm": 0.004558868706226349, "kl": 0.10372161865234375, "learning_rate": 3.777777777777778e-06, "loss": -0.0184, "num_tokens": 21451866.0, "reward": 0.8665769100189209, "reward_std": 0.2548443377017975, "rewards/accuracy_reward_step": 0.66796875, "rewards/asymmetric_l2_reward": 0.7345287799835205, "rewards/final_brier_reward_step": 0.669718861579895, "rewards/format_reward_step": 0.9765625, "step": 64 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5004037267080745, "calib/avg_num_step_conf": 9.6015625, "calib/ece": 0.44124313725490194, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9921568627450981, "calib/gap": 0.0013813664596273867, "calib/mean_conf": 0.9902627450980392, "calib/mu_c": 0.9908857142857143, "calib/mu_w": 0.9895043478260869, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.44124313725490194, "calib/std_conf": 0.009060882543352867, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9190794357832219, "calib/step_q_c_n": 1347.0, "calib/step_q_gap": 0.002451172956939307, "calib/step_q_w": 0.9166282628262826, "calib/step_q_w_n": 1111.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1578.0, "completions/max_terminated_length": 1578.0, "completions/mean_length": 767.11328125, "completions/mean_terminated_length": 770.1216430664062, "completions/min_length": 0.0, "completions/min_terminated_length": 393.0, "epoch": 0.06933333333333333, "grad_norm": 0.004670554306358099, "kl": 0.105682373046875, "learning_rate": 3.7500000000000005e-06, "loss": -0.0159, "num_tokens": 21753271.0, "reward": 0.745876669883728, "reward_std": 0.22282391786575317, "rewards/accuracy_reward_step": 0.546875, "rewards/asymmetric_l2_reward": 0.6341006755828857, "rewards/final_brier_reward_step": 0.5514026880264282, "rewards/format_reward_step": 0.984375, "step": 65 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.48895463510848125, "calib/avg_num_step_conf": 9.0859375, "calib/ece": 0.5179024291497976, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0002717948717949614, "calib/mean_conf": 0.9915866396761134, "calib/mu_c": 0.9914435897435897, "calib/mu_w": 0.9917153846153847, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5179024291497976, "calib/std_conf": 0.004608654906179873, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.917523611111111, "calib/step_q_c_n": 1080.0, "calib/step_q_gap": 0.008120721865525149, "calib/step_q_w": 0.9094028892455859, "calib/step_q_w_n": 1246.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3037.0, "completions/max_terminated_length": 3037.0, "completions/mean_length": 859.48046875, "completions/mean_terminated_length": 887.2056274414062, "completions/min_length": 0.0, "completions/min_terminated_length": 485.0, "epoch": 0.0704, "grad_norm": 0.004522152245044708, "kl": 0.10164642333984375, "learning_rate": 3.7222222222222225e-06, "loss": -0.0549, "num_tokens": 22079650.0, "reward": 0.6742883920669556, "reward_std": 0.24997103214263916, "rewards/accuracy_reward_step": 0.45703125, "rewards/asymmetric_l2_reward": 0.5988451838493347, "rewards/final_brier_reward_step": 0.4653565287590027, "rewards/format_reward_step": 0.96484375, "step": 66 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4814592503668134, "calib/avg_num_step_conf": 8.80859375, "calib/ece": 0.38200398406374503, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00022695744964651432, "calib/mean_conf": 0.9915657370517929, "calib/mu_c": 0.9914771241830065, "calib/mu_w": 0.991704081632653, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.38200398406374503, "calib/std_conf": 0.004837463914705453, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9192009943181819, "calib/step_q_c_n": 1408.0, "calib/step_q_gap": 0.0034536507526565785, "calib/step_q_w": 0.9157473435655253, "calib/step_q_w_n": 847.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2144.0, "completions/max_terminated_length": 2144.0, "completions/mean_length": 826.7578125, "completions/mean_terminated_length": 836.561279296875, "completions/min_length": 0.0, "completions/min_terminated_length": 496.0, "epoch": 0.07146666666666666, "grad_norm": 0.0044913021847605705, "kl": 0.099365234375, "learning_rate": 3.694444444444445e-06, "loss": 0.0045, "num_tokens": 22396308.0, "reward": 0.7945924401283264, "reward_std": 0.22359302639961243, "rewards/accuracy_reward_step": 0.59765625, "rewards/asymmetric_l2_reward": 0.6705037355422974, "rewards/final_brier_reward_step": 0.6038373708724976, "rewards/format_reward_step": 0.9765625, "step": 67 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.49789445979734176, "calib/avg_num_step_conf": 8.57421875, "calib/ece": 0.39864063745019934, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": -0.0008291354125539163, "calib/mean_conf": 0.9906725099601594, "calib/mu_c": 0.9903355704697987, "calib/mu_w": 0.9911647058823526, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39784382470119534, "calib/std_conf": 0.012954025404413391, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9185092524056256, "calib/step_q_c_n": 1351.0, "calib/step_q_gap": 0.00946245145775837, "calib/step_q_w": 0.9090468009478673, "calib/step_q_w_n": 844.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2807.0, "completions/max_terminated_length": 2807.0, "completions/mean_length": 836.859375, "completions/mean_terminated_length": 846.7826538085938, "completions/min_length": 0.0, "completions/min_terminated_length": 416.0, "epoch": 0.07253333333333334, "grad_norm": 0.004729957785457373, "kl": 0.0984649658203125, "learning_rate": 3.6666666666666666e-06, "loss": -0.0043, "num_tokens": 22714632.0, "reward": 0.788429319858551, "reward_std": 0.2118925005197525, "rewards/accuracy_reward_step": 0.58203125, "rewards/asymmetric_l2_reward": 0.6775600910186768, "rewards/final_brier_reward_step": 0.5883611440658569, "rewards/format_reward_step": 0.97265625, "step": 68 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.47797979797979806, "calib/avg_num_step_conf": 8.078125, "calib/ece": 0.44032202857142866, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0003832882154881645, "calib/mean_conf": 0.9913424367346939, "calib/mu_c": 0.991170348148148, "calib/mu_w": 0.9915536363636361, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.44032202857142866, "calib/std_conf": 0.0036035280865406505, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9141814665523157, "calib/step_q_c_n": 1166.0, "calib/step_q_gap": 0.003857741496883338, "calib/step_q_w": 0.9103237250554324, "calib/step_q_w_n": 902.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2661.0, "completions/max_terminated_length": 2661.0, "completions/mean_length": 869.47265625, "completions/mean_terminated_length": 883.2738647460938, "completions/min_length": 0.0, "completions/min_terminated_length": 527.0, "epoch": 0.0736, "grad_norm": 0.004320142790675163, "kl": 0.09468841552734375, "learning_rate": 3.638888888888889e-06, "loss": -0.0006, "num_tokens": 23041713.0, "reward": 0.7292081117630005, "reward_std": 0.24731634557247162, "rewards/accuracy_reward_step": 0.52734375, "rewards/asymmetric_l2_reward": 0.627022922039032, "rewards/final_brier_reward_step": 0.5345181822776794, "rewards/format_reward_step": 0.95703125, "step": 69 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.49813928761297194, "calib/avg_num_step_conf": 7.99609375, "calib/ece": 0.4548939024390245, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00015917065390724794, "calib/mean_conf": 0.9914792682926831, "calib/mu_c": 0.9915530303030302, "calib/mu_w": 0.991393859649123, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4548939024390245, "calib/std_conf": 0.0041524330551632686, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9180130095403296, "calib/step_q_c_n": 1153.0, "calib/step_q_gap": 0.0100985800101282, "calib/step_q_w": 0.9079144295302014, "calib/step_q_w_n": 894.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3024.0, "completions/max_terminated_length": 3024.0, "completions/mean_length": 831.33984375, "completions/mean_terminated_length": 847.9004516601562, "completions/min_length": 0.0, "completions/min_terminated_length": 383.0, "epoch": 0.07466666666666667, "grad_norm": 0.004979654680937529, "kl": 0.0870513916015625, "learning_rate": 3.6111111111111115e-06, "loss": 0.007, "num_tokens": 23361528.0, "reward": 0.7205314636230469, "reward_std": 0.25079619884490967, "rewards/accuracy_reward_step": 0.515625, "rewards/asymmetric_l2_reward": 0.6327803730964661, "rewards/final_brier_reward_step": 0.5153137445449829, "rewards/format_reward_step": 0.94921875, "step": 70 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5466309391543036, "calib/avg_num_step_conf": 7.71484375, "calib/ece": 0.4190948000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0008683027253120201, "calib/mean_conf": 0.9910948, "calib/mu_c": 0.9914664335664335, "calib/mu_w": 0.9905981308411215, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4190948000000001, "calib/std_conf": 0.0031562276470495625, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9149893195521103, "calib/step_q_c_n": 1161.0, "calib/step_q_gap": 0.0079745775373683, "calib/step_q_w": 0.907014742014742, "calib/step_q_w_n": 814.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2727.0, "completions/max_terminated_length": 2727.0, "completions/mean_length": 842.80859375, "completions/mean_terminated_length": 856.1865844726562, "completions/min_length": 0.0, "completions/min_terminated_length": 502.0, "epoch": 0.07573333333333333, "grad_norm": 0.004616872873157263, "kl": 0.08941650390625, "learning_rate": 3.5833333333333335e-06, "loss": 0.0087, "num_tokens": 23681695.0, "reward": 0.7680221796035767, "reward_std": 0.28707748651504517, "rewards/accuracy_reward_step": 0.55859375, "rewards/asymmetric_l2_reward": 0.6626472473144531, "rewards/final_brier_reward_step": 0.5663659572601318, "rewards/format_reward_step": 0.9765625, "step": 71 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.45953630796150485, "calib/avg_num_step_conf": 7.97265625, "calib/ece": 0.4894138339920948, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0007374828146481383, "calib/mean_conf": 0.991390118577075, "calib/mu_c": 0.9910228346456692, "calib/mu_w": 0.9917603174603173, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4894138339920948, "calib/std_conf": 0.005447240769577733, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9165731593662628, "calib/step_q_c_n": 1073.0, "calib/step_q_gap": 0.005960659366262888, "calib/step_q_w": 0.9106124999999999, "calib/step_q_w_n": 968.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2172.0, "completions/max_terminated_length": 2172.0, "completions/mean_length": 789.16796875, "completions/mean_terminated_length": 798.5256958007812, "completions/min_length": 0.0, "completions/min_terminated_length": 464.0, "epoch": 0.0768, "grad_norm": 0.004809282254427671, "kl": 0.097625732421875, "learning_rate": 3.555555555555556e-06, "loss": -0.0322, "num_tokens": 23988130.0, "reward": 0.7388637065887451, "reward_std": 0.25400206446647644, "rewards/accuracy_reward_step": 0.49609375, "rewards/asymmetric_l2_reward": 0.6775314807891846, "rewards/final_brier_reward_step": 0.5041021108627319, "rewards/format_reward_step": 0.984375, "step": 72 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4815950920245399, "calib/avg_num_step_conf": 7.9140625, "calib/ece": 0.34234183266932283, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.000287123535973266, "calib/mean_conf": 0.9917442231075698, "calib/mu_c": 0.9916435582822086, "calib/mu_w": 0.9919306818181819, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34234183266932283, "calib/std_conf": 0.003711937844814869, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9156873333333334, "calib/step_q_c_n": 1350.0, "calib/step_q_gap": 0.004211297830374883, "calib/step_q_w": 0.9114760355029585, "calib/step_q_w_n": 676.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2176.0, "completions/max_terminated_length": 2176.0, "completions/mean_length": 772.7890625, "completions/mean_terminated_length": 781.9525756835938, "completions/min_length": 0.0, "completions/min_terminated_length": 454.0, "epoch": 0.07786666666666667, "grad_norm": 0.0048720454797148705, "kl": 0.08669281005859375, "learning_rate": 3.5277777777777784e-06, "loss": 0.0042, "num_tokens": 24292996.0, "reward": 0.8364192247390747, "reward_std": 0.3165234923362732, "rewards/accuracy_reward_step": 0.640625, "rewards/asymmetric_l2_reward": 0.7072925567626953, "rewards/final_brier_reward_step": 0.6421083211898804, "rewards/format_reward_step": 0.9765625, "step": 73 }, { "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.48536348501664806, "calib/avg_num_step_conf": 7.81640625, "calib/ece": 0.4293219008264464, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.9958677685950413, "calib/gap": -0.0009002913429522197, "calib/mean_conf": 0.9913053719008266, "calib/mu_c": 0.9909110294117648, "calib/mu_w": 0.991811320754717, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4293219008264464, "calib/std_conf": 0.006998713877282736, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9165494362532525, "calib/step_q_c_n": 1153.0, "calib/step_q_gap": 0.008406275875894043, "calib/step_q_w": 0.9081431603773584, "calib/step_q_w_n": 848.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2647.0, "completions/max_terminated_length": 2647.0, "completions/mean_length": 807.8671875, "completions/mean_terminated_length": 833.9273681640625, "completions/min_length": 0.0, "completions/min_terminated_length": 348.0, "epoch": 0.07893333333333333, "grad_norm": 0.0050594923086464405, "kl": 0.0992584228515625, "learning_rate": 3.5e-06, "loss": -0.0046, "num_tokens": 24603738.0, "reward": 0.7477249503135681, "reward_std": 0.256036639213562, "rewards/accuracy_reward_step": 0.53125, "rewards/asymmetric_l2_reward": 0.6630828380584717, "rewards/final_brier_reward_step": 0.5378357172012329, "rewards/format_reward_step": 0.94140625, "step": 74 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4979323308270677, "calib/avg_num_step_conf": 8.22265625, "calib/ece": 0.29454183266932255, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -5.233082706768144e-05, "calib/mean_conf": 0.9917529880478086, "calib/mu_c": 0.9917371428571428, "calib/mu_w": 0.9917894736842104, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.29454183266932255, "calib/std_conf": 0.0038501230171085607, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9169490274983233, "calib/step_q_c_n": 1491.0, "calib/step_q_gap": 0.007225900462492829, "calib/step_q_w": 0.9097231270358305, "calib/step_q_w_n": 614.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2305.0, "completions/max_terminated_length": 2305.0, "completions/mean_length": 767.90625, "completions/mean_terminated_length": 783.2031860351562, "completions/min_length": 0.0, "completions/min_terminated_length": 495.0, "epoch": 0.08, "grad_norm": 0.008225214667618275, "kl": 0.0906524658203125, "learning_rate": 3.4722222222222224e-06, "loss": -0.0088, "num_tokens": 24905074.0, "reward": 0.8864485025405884, "reward_std": 0.2618010640144348, "rewards/accuracy_reward_step": 0.68359375, "rewards/asymmetric_l2_reward": 0.7516970038414001, "rewards/final_brier_reward_step": 0.6883875131607056, "rewards/format_reward_step": 0.98046875, "step": 75 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5231930213237331, "calib/avg_num_step_conf": 7.81640625, "calib/ece": 0.36082730923694784, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00041622819163655844, "calib/mean_conf": 0.9913493975903614, "calib/mu_c": 0.9915031847133756, "calib/mu_w": 0.9910869565217391, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36082730923694784, "calib/std_conf": 0.0032884136442572405, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9175392670157068, "calib/step_q_c_n": 1337.0, "calib/step_q_gap": 0.011506134485586239, "calib/step_q_w": 0.9060331325301205, "calib/step_q_w_n": 664.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2962.0, "completions/max_terminated_length": 2962.0, "completions/mean_length": 802.3671875, "completions/mean_terminated_length": 818.3506469726562, "completions/min_length": 0.0, "completions/min_terminated_length": 457.0, "epoch": 0.08106666666666666, "grad_norm": 0.004342818632721901, "kl": 0.082427978515625, "learning_rate": 3.444444444444445e-06, "loss": -0.0124, "num_tokens": 25213536.0, "reward": 0.8073495626449585, "reward_std": 0.22171545028686523, "rewards/accuracy_reward_step": 0.61328125, "rewards/asymmetric_l2_reward": 0.6834534406661987, "rewards/final_brier_reward_step": 0.6156206130981445, "rewards/format_reward_step": 0.96484375, "step": 76 }, { "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.4841960434329912, "calib/avg_num_step_conf": 7.52734375, "calib/ece": 0.32991020408163263, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.494868362328372e-05, "calib/mean_conf": 0.991134693877551, "calib/mu_c": 0.9911296296296296, "calib/mu_w": 0.9911445783132529, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.32991020408163263, "calib/std_conf": 0.0038748562064384777, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.9157070552147238, "calib/step_q_c_n": 1304.0, "calib/step_q_gap": 0.009309949105399062, "calib/step_q_w": 0.9063971061093248, "calib/step_q_w_n": 622.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2973.0, "completions/max_terminated_length": 2973.0, "completions/mean_length": 796.47265625, "completions/mean_terminated_length": 822.165283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 393.0, "epoch": 0.08213333333333334, "grad_norm": 0.004987195134162903, "kl": 0.0777130126953125, "learning_rate": 3.416666666666667e-06, "loss": -0.0196, "num_tokens": 25522097.0, "reward": 0.816591739654541, "reward_std": 0.272071897983551, "rewards/accuracy_reward_step": 0.6328125, "rewards/asymmetric_l2_reward": 0.6926058530807495, "rewards/final_brier_reward_step": 0.6265150904655457, "rewards/format_reward_step": 0.9375, "step": 77 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4534714523281596, "calib/avg_num_step_conf": 7.14453125, "calib/ece": 0.34036468253968244, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0008026330376940116, "calib/mean_conf": 0.9911583333333333, "calib/mu_c": 0.9908780487804878, "calib/mu_w": 0.9916806818181818, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34036468253968244, "calib/std_conf": 0.0032513077344550156, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.910915711947627, "calib/step_q_c_n": 1222.0, "calib/step_q_gap": 0.0065277383067702255, "calib/step_q_w": 0.9043879736408568, "calib/step_q_w_n": 607.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2379.0, "completions/max_terminated_length": 2379.0, "completions/mean_length": 841.29296875, "completions/mean_terminated_length": 854.6468505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 449.0, "epoch": 0.0832, "grad_norm": 0.0048126475885510445, "kl": 0.07703399658203125, "learning_rate": 3.3888888888888893e-06, "loss": -0.0472, "num_tokens": 25845492.0, "reward": 0.8382173776626587, "reward_std": 0.19039109349250793, "rewards/accuracy_reward_step": 0.640625, "rewards/asymmetric_l2_reward": 0.7068257331848145, "rewards/final_brier_reward_step": 0.6461716294288635, "rewards/format_reward_step": 0.9765625, "step": 78 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4819757842597689, "calib/avg_num_step_conf": 7.41796875, "calib/ece": 0.35984720000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00037764171711607286, "calib/mean_conf": 0.9918472, "calib/mu_c": 0.9917082278481012, "calib/mu_w": 0.9920858695652173, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.35984720000000003, "calib/std_conf": 0.00402043929938011, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9162189516129032, "calib/step_q_c_n": 1240.0, "calib/step_q_gap": 0.007129422022614906, "calib/step_q_w": 0.9090895295902883, "calib/step_q_w_n": 659.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2280.0, "completions/max_terminated_length": 2280.0, "completions/mean_length": 811.25390625, "completions/mean_terminated_length": 827.4143676757812, "completions/min_length": 0.0, "completions/min_terminated_length": 497.0, "epoch": 0.08426666666666667, "grad_norm": 0.004438234027475119, "kl": 0.070159912109375, "learning_rate": 3.3611111111111117e-06, "loss": -0.0114, "num_tokens": 26159549.0, "reward": 0.8238353729248047, "reward_std": 0.22975537180900574, "rewards/accuracy_reward_step": 0.6171875, "rewards/asymmetric_l2_reward": 0.7078436017036438, "rewards/final_brier_reward_step": 0.6226396560668945, "rewards/format_reward_step": 0.96875, "step": 79 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.4900687547746371, "calib/avg_num_step_conf": 7.953125, "calib/ece": 0.302919028340081, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00019404125286448526, "calib/mean_conf": 0.9911781376518218, "calib/mu_c": 0.9911176470588235, "calib/mu_w": 0.991311688311688, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.302919028340081, "calib/std_conf": 0.003187883147295252, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9157367272727273, "calib/step_q_c_n": 1375.0, "calib/step_q_gap": 0.0034265911153900097, "calib/step_q_w": 0.9123101361573372, "calib/step_q_w_n": 661.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2978.0, "completions/max_terminated_length": 2978.0, "completions/mean_length": 773.28125, "completions/mean_terminated_length": 795.0200805664062, "completions/min_length": 0.0, "completions/min_terminated_length": 534.0, "epoch": 0.08533333333333333, "grad_norm": 0.005505390930920839, "kl": 0.080474853515625, "learning_rate": 3.3333333333333333e-06, "loss": 0.0005, "num_tokens": 26459669.0, "reward": 0.8524667024612427, "reward_std": 0.2551957368850708, "rewards/accuracy_reward_step": 0.6640625, "rewards/asymmetric_l2_reward": 0.7146350741386414, "rewards/final_brier_reward_step": 0.665298342704773, "rewards/format_reward_step": 0.9609375, "step": 80 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5199259581881535, "calib/avg_num_step_conf": 7.8671875, "calib/ece": 0.318744, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.000526422764227652, "calib/mean_conf": 0.9907440000000001, "calib/mu_c": 0.9909166666666666, "calib/mu_w": 0.9903902439024389, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.318744, "calib/std_conf": 0.00405345087548869, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9207250177179306, "calib/step_q_c_n": 1411.0, "calib/step_q_gap": 0.013496161996537581, "calib/step_q_w": 0.907228855721393, "calib/step_q_w_n": 603.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2670.0, "completions/max_terminated_length": 2670.0, "completions/mean_length": 819.91015625, "completions/mean_terminated_length": 829.6324462890625, "completions/min_length": 0.0, "completions/min_terminated_length": 470.0, "epoch": 0.0864, "grad_norm": 0.004928308539092541, "kl": 0.07088470458984375, "learning_rate": 3.3055555555555558e-06, "loss": -0.0144, "num_tokens": 26775814.0, "reward": 0.8462182879447937, "reward_std": 0.25106942653656006, "rewards/accuracy_reward_step": 0.65625, "rewards/asymmetric_l2_reward": 0.7091135382652283, "rewards/final_brier_reward_step": 0.6583229303359985, "rewards/format_reward_step": 0.96875, "step": 81 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5263157894736842, "calib/avg_num_step_conf": 7.92578125, "calib/ece": 0.370248, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0009602716468593009, "calib/mean_conf": 0.990248, "calib/mu_c": 0.9906129032258065, "calib/mu_w": 0.9896526315789472, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.370248, "calib/std_conf": 0.003966168932357777, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9210427018633541, "calib/step_q_c_n": 1288.0, "calib/step_q_gap": 0.010635144508428285, "calib/step_q_w": 0.9104075573549258, "calib/step_q_w_n": 741.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2816.0, "completions/max_terminated_length": 2816.0, "completions/mean_length": 784.0390625, "completions/mean_terminated_length": 796.4841918945312, "completions/min_length": 0.0, "completions/min_terminated_length": 537.0, "epoch": 0.08746666666666666, "grad_norm": 0.004967535380274057, "kl": 0.07586669921875, "learning_rate": 3.277777777777778e-06, "loss": 0.0084, "num_tokens": 27082080.0, "reward": 0.8055253028869629, "reward_std": 0.19441071152687073, "rewards/accuracy_reward_step": 0.60546875, "rewards/asymmetric_l2_reward": 0.6871503591537476, "rewards/final_brier_reward_step": 0.6090565919876099, "rewards/format_reward_step": 0.96875, "step": 82 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.482809588072746, "calib/avg_num_step_conf": 7.84375, "calib/ece": 0.4586956000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0001689287320867372, "calib/mean_conf": 0.9906956000000001, "calib/mu_c": 0.9906165413533833, "calib/mu_w": 0.99078547008547, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4586956000000001, "calib/std_conf": 0.003282100035038545, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9204350940017905, "calib/step_q_c_n": 1117.0, "calib/step_q_gap": 0.00724429714432695, "calib/step_q_w": 0.9131907968574635, "calib/step_q_w_n": 891.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1702.0, "completions/max_terminated_length": 1702.0, "completions/mean_length": 820.61328125, "completions/mean_terminated_length": 836.960205078125, "completions/min_length": 0.0, "completions/min_terminated_length": 526.0, "epoch": 0.08853333333333334, "grad_norm": 0.004836047068238258, "kl": 0.07021331787109375, "learning_rate": 3.2500000000000002e-06, "loss": -0.0354, "num_tokens": 27399421.0, "reward": 0.7269289493560791, "reward_std": 0.18077202141284943, "rewards/accuracy_reward_step": 0.5234375, "rewards/asymmetric_l2_reward": 0.6259989738464355, "rewards/final_brier_reward_step": 0.5278588533401489, "rewards/format_reward_step": 0.9765625, "step": 83 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4943981240229286, "calib/avg_num_step_conf": 8.1953125, "calib/ece": 0.38960474308300397, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -8.370244919175818e-05, "calib/mean_conf": 0.990395256916996, "calib/mu_c": 0.9903618421052633, "calib/mu_w": 0.9904455445544551, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38960474308300397, "calib/std_conf": 0.002242664201469968, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9229286775631502, "calib/step_q_c_n": 1346.0, "calib/step_q_gap": 0.013153411605703491, "calib/step_q_w": 0.9097752659574467, "calib/step_q_w_n": 752.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3011.0, "completions/max_terminated_length": 3011.0, "completions/mean_length": 787.5703125, "completions/mean_terminated_length": 793.7716674804688, "completions/min_length": 0.0, "completions/min_terminated_length": 491.0, "epoch": 0.0896, "grad_norm": 0.005793274845927954, "kl": 0.07592010498046875, "learning_rate": 3.2222222222222227e-06, "loss": 0.0023, "num_tokens": 27706959.0, "reward": 0.8168776035308838, "reward_std": 0.2131912112236023, "rewards/accuracy_reward_step": 0.59375, "rewards/asymmetric_l2_reward": 0.7161560654640198, "rewards/final_brier_reward_step": 0.6011929512023926, "rewards/format_reward_step": 0.98828125, "step": 84 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.4797160243407708, "calib/avg_num_step_conf": 8.15234375, "calib/ece": 0.40364372469635623, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.000364029749830852, "calib/mean_conf": 0.9906882591093117, "calib/mu_c": 0.9905379310344826, "calib/mu_w": 0.9909019607843135, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40364372469635623, "calib/std_conf": 0.00261805932345344, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9229945355191257, "calib/step_q_c_n": 1281.0, "calib/step_q_gap": 0.012014386635751051, "calib/step_q_w": 0.9109801488833746, "calib/step_q_w_n": 806.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2815.0, "completions/max_terminated_length": 2815.0, "completions/mean_length": 850.41015625, "completions/mean_terminated_length": 863.9087524414062, "completions/min_length": 0.0, "completions/min_terminated_length": 495.0, "epoch": 0.09066666666666667, "grad_norm": 0.0050011295825243, "kl": 0.0685577392578125, "learning_rate": 3.1944444444444443e-06, "loss": 0.0118, "num_tokens": 28032488.0, "reward": 0.7705468535423279, "reward_std": 0.22396138310432434, "rewards/accuracy_reward_step": 0.56640625, "rewards/asymmetric_l2_reward": 0.6612777709960938, "rewards/final_brier_reward_step": 0.5735659599304199, "rewards/format_reward_step": 0.96484375, "step": 85 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.47517161737345226, "calib/avg_num_step_conf": 8.1015625, "calib/ece": 0.42288492063492056, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00032931288894577637, "calib/mean_conf": 0.990345238095238, "calib/mu_c": 0.9902027972027971, "calib/mu_w": 0.9905321100917429, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42288492063492056, "calib/std_conf": 0.0027767203202630013, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9197145187601959, "calib/step_q_c_n": 1226.0, "calib/step_q_gap": 0.006379613099818493, "calib/step_q_w": 0.9133349056603774, "calib/step_q_w_n": 848.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2963.0, "completions/max_terminated_length": 2963.0, "completions/mean_length": 850.4453125, "completions/mean_terminated_length": 860.5296630859375, "completions/min_length": 0.0, "completions/min_terminated_length": 542.0, "epoch": 0.09173333333333333, "grad_norm": 0.005102749913930893, "kl": 0.07195281982421875, "learning_rate": 3.1666666666666667e-06, "loss": -0.0102, "num_tokens": 28355714.0, "reward": 0.7638604640960693, "reward_std": 0.23303180932998657, "rewards/accuracy_reward_step": 0.55859375, "rewards/asymmetric_l2_reward": 0.6525702476501465, "rewards/final_brier_reward_step": 0.5665569305419922, "rewards/format_reward_step": 0.984375, "step": 86 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.521505376344086, "calib/avg_num_step_conf": 8.33203125, "calib/ece": 0.237246963562753, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0003763440860211631, "calib/mean_conf": 0.9902834008097166, "calib/mu_c": 0.9903763440860214, "calib/mu_w": 0.9900000000000002, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.237246963562753, "calib/std_conf": 0.0015613784309172973, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9218906832298137, "calib/step_q_c_n": 1610.0, "calib/step_q_gap": 0.005447088583542148, "calib/step_q_w": 0.9164435946462716, "calib/step_q_w_n": 523.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2970.0, "completions/max_terminated_length": 2970.0, "completions/mean_length": 805.390625, "completions/mean_terminated_length": 831.3709716796875, "completions/min_length": 0.0, "completions/min_terminated_length": 532.0, "epoch": 0.0928, "grad_norm": 0.0050618527457118034, "kl": 0.07224273681640625, "learning_rate": 3.138888888888889e-06, "loss": -0.0338, "num_tokens": 28667390.0, "reward": 0.9251996874809265, "reward_std": 0.23489241302013397, "rewards/accuracy_reward_step": 0.7265625, "rewards/asymmetric_l2_reward": 0.7817424535751343, "rewards/final_brier_reward_step": 0.7311569452285767, "rewards/format_reward_step": 0.9609375, "step": 87 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4774013402829486, "calib/avg_num_step_conf": 7.7109375, "calib/ece": 0.3074859437751004, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00014586746090783365, "calib/mean_conf": 0.9902168674698796, "calib/mu_c": 0.9901705882352941, "calib/mu_w": 0.990316455696202, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.3074859437751004, "calib/std_conf": 0.0025930778696092926, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.9233775071633237, "calib/step_q_c_n": 1396.0, "calib/step_q_gap": 0.012166434498963707, "calib/step_q_w": 0.91121107266436, "calib/step_q_w_n": 578.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1921.0, "completions/max_terminated_length": 1921.0, "completions/mean_length": 837.984375, "completions/mean_terminated_length": 851.2857666015625, "completions/min_length": 0.0, "completions/min_terminated_length": 122.0, "epoch": 0.09386666666666667, "grad_norm": 0.0049079423770308495, "kl": 0.06745147705078125, "learning_rate": 3.1111111111111116e-06, "loss": -0.0159, "num_tokens": 28991762.0, "reward": 0.8723215460777283, "reward_std": 0.21739079058170319, "rewards/accuracy_reward_step": 0.6640625, "rewards/asymmetric_l2_reward": 0.7529059648513794, "rewards/final_brier_reward_step": 0.6659557819366455, "rewards/format_reward_step": 0.96484375, "step": 88 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.4989224137931034, "calib/avg_num_step_conf": 7.87890625, "calib/ece": 0.4656352459016394, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.4008620689454077e-05, "calib/mean_conf": 0.9902254098360657, "calib/mu_c": 0.9902187500000001, "calib/mu_w": 0.9902327586206896, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4656352459016394, "calib/std_conf": 0.0019126439290279625, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9246768128916741, "calib/step_q_c_n": 1117.0, "calib/step_q_gap": 0.013053479558340797, "calib/step_q_w": 0.9116233333333333, "calib/step_q_w_n": 900.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2996.0, "completions/max_terminated_length": 2996.0, "completions/mean_length": 876.921875, "completions/mean_terminated_length": 905.2096557617188, "completions/min_length": 0.0, "completions/min_terminated_length": 488.0, "epoch": 0.09493333333333333, "grad_norm": 0.004340740852057934, "kl": 0.06580352783203125, "learning_rate": 3.0833333333333336e-06, "loss": -0.0493, "num_tokens": 29325142.0, "reward": 0.7178754806518555, "reward_std": 0.19721011817455292, "rewards/accuracy_reward_step": 0.50390625, "rewards/asymmetric_l2_reward": 0.6355875730514526, "rewards/final_brier_reward_step": 0.5087569952011108, "rewards/format_reward_step": 0.953125, "step": 89 }, { "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.49875539937037855, "calib/avg_num_step_conf": 8.67578125, "calib/ece": 0.3468770491803278, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -2.1231422504985886e-05, "calib/mean_conf": 0.9903196721311475, "calib/mu_c": 0.990312101910828, "calib/mu_w": 0.990333333333333, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3468770491803278, "calib/std_conf": 0.0017380335290896219, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.924378156996587, "calib/step_q_c_n": 1465.0, "calib/step_q_gap": 0.0031969400653700797, "calib/step_q_w": 0.9211812169312169, "calib/step_q_w_n": 756.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2461.0, "completions/max_terminated_length": 2461.0, "completions/mean_length": 826.85546875, "completions/mean_terminated_length": 863.9795532226562, "completions/min_length": 0.0, "completions/min_terminated_length": 336.0, "epoch": 0.096, "grad_norm": 0.004711349029093981, "kl": 0.070892333984375, "learning_rate": 3.055555555555556e-06, "loss": -0.0575, "num_tokens": 29640137.0, "reward": 0.8198519945144653, "reward_std": 0.2870404124259949, "rewards/accuracy_reward_step": 0.61328125, "rewards/asymmetric_l2_reward": 0.7075222134590149, "rewards/final_brier_reward_step": 0.6196815967559814, "rewards/format_reward_step": 0.94921875, "step": 90 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.4888357825652732, "calib/avg_num_step_conf": 8.2734375, "calib/ece": 0.3355742971887552, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00020288200884555074, "calib/mean_conf": 0.9901927710843376, "calib/mu_c": 0.990122699386503, "calib/mu_w": 0.9903255813953485, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3355742971887552, "calib/std_conf": 0.0013484290844498997, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.9262064156206417, "calib/step_q_c_n": 1434.0, "calib/step_q_gap": 0.01088770217034929, "calib/step_q_w": 0.9153187134502924, "calib/step_q_w_n": 684.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1977.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 870.3046875, "completions/mean_terminated_length": 891.1920166015625, "completions/min_length": 0.0, "completions/min_terminated_length": 581.0, "epoch": 0.09706666666666666, "grad_norm": 0.005069150123745203, "kl": 0.068084716796875, "learning_rate": 3.0277777777777776e-06, "loss": -0.0399, "num_tokens": 29970647.0, "reward": 0.8304876089096069, "reward_std": 0.24938277900218964, "rewards/accuracy_reward_step": 0.63671875, "rewards/asymmetric_l2_reward": 0.707068920135498, "rewards/final_brier_reward_step": 0.6351562738418579, "rewards/format_reward_step": 0.95703125, "step": 91 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5065629904408617, "calib/avg_num_step_conf": 9.01171875, "calib/ece": 0.3352449799196787, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9959839357429718, "calib/gap": -0.00036845484377190374, "calib/mean_conf": 0.9898634538152611, "calib/mu_c": 0.9897361963190184, "calib/mu_w": 0.9901046511627903, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3352449799196787, "calib/std_conf": 0.0058835272831902164, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9270372062663186, "calib/step_q_c_n": 1532.0, "calib/step_q_gap": 0.006266883685673408, "calib/step_q_w": 0.9207703225806452, "calib/step_q_w_n": 775.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2705.0, "completions/max_terminated_length": 2705.0, "completions/mean_length": 818.80078125, "completions/mean_terminated_length": 838.4520263671875, "completions/min_length": 0.0, "completions/min_terminated_length": 582.0, "epoch": 0.09813333333333334, "grad_norm": 0.005148807540535927, "kl": 0.07390594482421875, "learning_rate": 3e-06, "loss": -0.0121, "num_tokens": 30286980.0, "reward": 0.8529649972915649, "reward_std": 0.20928457379341125, "rewards/accuracy_reward_step": 0.63671875, "rewards/asymmetric_l2_reward": 0.740821361541748, "rewards/final_brier_reward_step": 0.6432335376739502, "rewards/format_reward_step": 0.97265625, "step": 92 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.487980275323608, "calib/avg_num_step_conf": 8.75390625, "calib/ece": 0.36226800000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0002238887747415541, "calib/mean_conf": 0.990268, "calib/mu_c": 0.9901847133757958, "calib/mu_w": 0.9904086021505374, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36226800000000003, "calib/std_conf": 0.0018166386542182804, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9278586206896552, "calib/step_q_c_n": 1450.0, "calib/step_q_gap": 0.004825750904572934, "calib/step_q_w": 0.9230328697850823, "calib/step_q_w_n": 791.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3050.0, "completions/max_terminated_length": 3050.0, "completions/mean_length": 858.32421875, "completions/mean_terminated_length": 871.948486328125, "completions/min_length": 0.0, "completions/min_terminated_length": 562.0, "epoch": 0.0992, "grad_norm": 0.005295910406857729, "kl": 0.07146453857421875, "learning_rate": 2.9722222222222225e-06, "loss": -0.0174, "num_tokens": 30612487.0, "reward": 0.8195692300796509, "reward_std": 0.25328242778778076, "rewards/accuracy_reward_step": 0.61328125, "rewards/asymmetric_l2_reward": 0.7010154128074646, "rewards/final_brier_reward_step": 0.6201542615890503, "rewards/format_reward_step": 0.9765625, "step": 93 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.4979994481236203, "calib/avg_num_step_conf": 8.26953125, "calib/ece": 0.37865991902834006, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.000177014348786253, "calib/mean_conf": 0.989995951417004, "calib/mu_c": 0.9899271523178805, "calib/mu_w": 0.9901041666666668, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.37865991902834006, "calib/std_conf": 0.0023645457759122867, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9265103397341212, "calib/step_q_c_n": 1354.0, "calib/step_q_gap": 0.012775084164003192, "calib/step_q_w": 0.913735255570118, "calib/step_q_w_n": 763.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3012.0, "completions/max_terminated_length": 3012.0, "completions/mean_length": 871.8046875, "completions/mean_terminated_length": 889.1713256835938, "completions/min_length": 0.0, "completions/min_terminated_length": 571.0, "epoch": 0.10026666666666667, "grad_norm": 0.0050525604747235775, "kl": 0.0719146728515625, "learning_rate": 2.944444444444445e-06, "loss": -0.0031, "num_tokens": 30944349.0, "reward": 0.7948108911514282, "reward_std": 0.20116651058197021, "rewards/accuracy_reward_step": 0.58984375, "rewards/asymmetric_l2_reward": 0.6823795437812805, "rewards/final_brier_reward_step": 0.5970859527587891, "rewards/format_reward_step": 0.9609375, "step": 94 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.514673311184939, "calib/avg_num_step_conf": 8.80859375, "calib/ece": 0.32853937007874, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.000989341085271489, "calib/mean_conf": 0.9899566929133857, "calib/mu_c": 0.9902916666666667, "calib/mu_w": 0.9893023255813952, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32853937007874, "calib/std_conf": 0.004645090381653441, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9304301007556676, "calib/step_q_c_n": 1588.0, "calib/step_q_gap": 0.017356637487301763, "calib/step_q_w": 0.9130734632683658, "calib/step_q_w_n": 667.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2072.0, "completions/max_terminated_length": 2072.0, "completions/mean_length": 832.890625, "completions/mean_terminated_length": 839.4487915039062, "completions/min_length": 0.0, "completions/min_terminated_length": 482.0, "epoch": 0.10133333333333333, "grad_norm": 0.005352802574634552, "kl": 0.0714111328125, "learning_rate": 2.916666666666667e-06, "loss": 0.0029, "num_tokens": 31263697.0, "reward": 0.8585090637207031, "reward_std": 0.22589869797229767, "rewards/accuracy_reward_step": 0.65625, "rewards/asymmetric_l2_reward": 0.7240145206451416, "rewards/final_brier_reward_step": 0.663316011428833, "rewards/format_reward_step": 0.9921875, "step": 95 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5102040816326531, "calib/avg_num_step_conf": 8.4296875, "calib/ece": 0.18564000000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.996, "calib/gap": 0.001836734693877351, "calib/mean_conf": 0.9896400000000001, "calib/mu_c": 0.99, "calib/mu_w": 0.9881632653061226, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.18564000000000003, "calib/std_conf": 0.005819828176157777, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9281357818586534, "calib/step_q_c_n": 1797.0, "calib/step_q_gap": 0.016833842800481613, "calib/step_q_w": 0.9113019390581718, "calib/step_q_w_n": 361.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3063.0, "completions/max_terminated_length": 3063.0, "completions/mean_length": 784.93359375, "completions/mean_terminated_length": 791.1141967773438, "completions/min_length": 0.0, "completions/min_terminated_length": 76.0, "epoch": 0.1024, "grad_norm": 0.005755927413702011, "kl": 0.0801849365234375, "learning_rate": 2.888888888888889e-06, "loss": -0.0024, "num_tokens": 31570456.0, "reward": 0.9797643423080444, "reward_std": 0.14108261466026306, "rewards/accuracy_reward_step": 0.78515625, "rewards/asymmetric_l2_reward": 0.8193533420562744, "rewards/final_brier_reward_step": 0.7893941402435303, "rewards/format_reward_step": 0.96875, "step": 96 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.49388794567062816, "calib/avg_num_step_conf": 8.42578125, "calib/ece": 0.3702320000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00011816638370087951, "calib/mean_conf": 0.9902320000000001, "calib/mu_c": 0.9901870967741935, "calib/mu_w": 0.9903052631578944, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3702320000000001, "calib/std_conf": 0.0014812751263691716, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9278878157503714, "calib/step_q_c_n": 1346.0, "calib/step_q_gap": 0.006870553111653854, "calib/step_q_w": 0.9210172626387175, "calib/step_q_w_n": 811.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2962.0, "completions/max_terminated_length": 2962.0, "completions/mean_length": 834.37109375, "completions/mean_terminated_length": 847.6151123046875, "completions/min_length": 0.0, "completions/min_terminated_length": 387.0, "epoch": 0.10346666666666667, "grad_norm": 0.007479469757527113, "kl": 0.07452392578125, "learning_rate": 2.861111111111111e-06, "loss": -0.0055, "num_tokens": 31889127.0, "reward": 0.798133134841919, "reward_std": 0.30348360538482666, "rewards/accuracy_reward_step": 0.60546875, "rewards/asymmetric_l2_reward": 0.6690092086791992, "rewards/final_brier_reward_step": 0.612413227558136, "rewards/format_reward_step": 0.96875, "step": 97 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5033557046979866, "calib/avg_num_step_conf": 8.08203125, "calib/ece": 0.38186938775510204, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 5.3691275167522257e-05, "calib/mean_conf": 0.9900326530612245, "calib/mu_c": 0.9900536912751676, "calib/mu_w": 0.9900000000000001, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.38186938775510204, "calib/std_conf": 0.000510057121691864, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.923427536231884, "calib/step_q_c_n": 1242.0, "calib/step_q_gap": 0.0032098820601790257, "calib/step_q_w": 0.920217654171705, "calib/step_q_w_n": 827.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2976.0, "completions/max_terminated_length": 2976.0, "completions/mean_length": 822.546875, "completions/mean_terminated_length": 845.670654296875, "completions/min_length": 0.0, "completions/min_terminated_length": 408.0, "epoch": 0.10453333333333334, "grad_norm": 0.0054461113177239895, "kl": 0.0721435546875, "learning_rate": 2.8333333333333335e-06, "loss": 0.0025, "num_tokens": 32205883.0, "reward": 0.7846657037734985, "reward_std": 0.2299382984638214, "rewards/accuracy_reward_step": 0.58203125, "rewards/asymmetric_l2_reward": 0.6729419827461243, "rewards/final_brier_reward_step": 0.5893582105636597, "rewards/format_reward_step": 0.953125, "step": 98 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4964211348256116, "calib/avg_num_step_conf": 7.6640625, "calib/ece": 0.5362168674698796, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -5.8823529411999864e-05, "calib/mean_conf": 0.9900321285140563, "calib/mu_c": 0.99, "calib/mu_w": 0.990058823529412, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.5362168674698796, "calib/std_conf": 0.0016302154142128443, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9243573667711599, "calib/step_q_c_n": 957.0, "calib/step_q_gap": 0.007462839407975763, "calib/step_q_w": 0.9168945273631841, "calib/step_q_w_n": 1005.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 3060.0, "completions/max_terminated_length": 3060.0, "completions/mean_length": 867.2109375, "completions/mean_terminated_length": 888.0240478515625, "completions/min_length": 0.0, "completions/min_terminated_length": 391.0, "epoch": 0.1056, "grad_norm": 0.0054686726070940495, "kl": 0.06401824951171875, "learning_rate": 2.805555555555556e-06, "loss": -0.035, "num_tokens": 32533689.0, "reward": 0.6467150449752808, "reward_std": 0.30831170082092285, "rewards/accuracy_reward_step": 0.44140625, "rewards/asymmetric_l2_reward": 0.5605430006980896, "rewards/final_brier_reward_step": 0.4516370892524719, "rewards/format_reward_step": 0.96484375, "step": 99 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5060975609756098, "calib/avg_num_step_conf": 7.73828125, "calib/ece": 0.32053061224489776, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 6.24811803673353e-05, "calib/mean_conf": 0.9899183673469386, "calib/mu_c": 0.9899390243902438, "calib/mu_w": 0.9898765432098765, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.32053061224489776, "calib/std_conf": 0.0015627909974756458, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9257551896921975, "calib/step_q_c_n": 1397.0, "calib/step_q_gap": 0.018169573253841387, "calib/step_q_w": 0.9075856164383561, "calib/step_q_w_n": 584.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2148.0, "completions/max_terminated_length": 2148.0, "completions/mean_length": 847.85546875, "completions/mean_terminated_length": 871.6907348632812, "completions/min_length": 0.0, "completions/min_terminated_length": 566.0, "epoch": 0.10666666666666667, "grad_norm": 0.005433334968984127, "kl": 0.06568145751953125, "learning_rate": 2.7777777777777783e-06, "loss": 0.0017, "num_tokens": 32858148.0, "reward": 0.8413238525390625, "reward_std": 0.1991977095603943, "rewards/accuracy_reward_step": 0.640625, "rewards/asymmetric_l2_reward": 0.7170438766479492, "rewards/final_brier_reward_step": 0.6468539237976074, "rewards/format_reward_step": 0.953125, "step": 100 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.49193548387096775, "calib/avg_num_step_conf": 7.47265625, "calib/ece": 0.48804819277108447, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00011290322580648215, "calib/mean_conf": 0.9900562248995985, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9901129032258064, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.48804819277108447, "calib/std_conf": 0.0006500319776058185, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9235257731958764, "calib/step_q_c_n": 970.0, "calib/step_q_gap": 0.007631817734582658, "calib/step_q_w": 0.9158939554612937, "calib/step_q_w_n": 943.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2855.0, "completions/max_terminated_length": 2855.0, "completions/mean_length": 897.859375, "completions/mean_terminated_length": 908.5059814453125, "completions/min_length": 0.0, "completions/min_terminated_length": 496.0, "epoch": 0.10773333333333333, "grad_norm": 0.005266252439469099, "kl": 0.06288909912109375, "learning_rate": 2.7500000000000004e-06, "loss": 0.0061, "num_tokens": 33194992.0, "reward": 0.696447491645813, "reward_std": 0.26901131868362427, "rewards/accuracy_reward_step": 0.48828125, "rewards/asymmetric_l2_reward": 0.6076316833496094, "rewards/final_brier_reward_step": 0.49385690689086914, "rewards/format_reward_step": 0.96875, "step": 101 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5056179775280899, "calib/avg_num_step_conf": 8.453125, "calib/ece": 0.29203921568627444, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00011235955056221325, "calib/mean_conf": 0.9900784313725489, "calib/mu_c": 0.9901123595505619, "calib/mu_w": 0.9899999999999997, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.29203921568627444, "calib/std_conf": 0.0012499903882752347, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9253944836433612, "calib/step_q_c_n": 1559.0, "calib/step_q_gap": 0.009543243973939686, "calib/step_q_w": 0.9158512396694215, "calib/step_q_w_n": 605.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2082.0, "completions/max_terminated_length": 2082.0, "completions/mean_length": 804.48828125, "completions/mean_terminated_length": 807.6431884765625, "completions/min_length": 0.0, "completions/min_terminated_length": 464.0, "epoch": 0.1088, "grad_norm": 0.005804200656712055, "kl": 0.0684051513671875, "learning_rate": 2.7222222222222224e-06, "loss": -0.0227, "num_tokens": 33507637.0, "reward": 0.8951650857925415, "reward_std": 0.1038808822631836, "rewards/accuracy_reward_step": 0.6953125, "rewards/asymmetric_l2_reward": 0.7516794204711914, "rewards/final_brier_reward_step": 0.7011507749557495, "rewards/format_reward_step": 0.9921875, "step": 102 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.49127270157586034, "calib/avg_num_step_conf": 7.22265625, "calib/ece": 0.34886693548387115, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00016401667726673175, "calib/mean_conf": 0.9899959677419357, "calib/mu_c": 0.9899371069182389, "calib/mu_w": 0.9901011235955056, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.34886693548387115, "calib/std_conf": 0.0008542964652697105, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9279089506172841, "calib/step_q_c_n": 1296.0, "calib/step_q_gap": 0.0188962694578636, "calib/step_q_w": 0.9090126811594205, "calib/step_q_w_n": 552.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1816.0, "completions/max_terminated_length": 1816.0, "completions/mean_length": 886.0703125, "completions/mean_terminated_length": 907.3360595703125, "completions/min_length": 0.0, "completions/min_terminated_length": 529.0, "epoch": 0.10986666666666667, "grad_norm": 0.004914054647088051, "kl": 0.05438232421875, "learning_rate": 2.6944444444444444e-06, "loss": -0.0343, "num_tokens": 33839023.0, "reward": 0.8248037099838257, "reward_std": 0.1900019645690918, "rewards/accuracy_reward_step": 0.62109375, "rewards/asymmetric_l2_reward": 0.7054777145385742, "rewards/final_brier_reward_step": 0.6277234554290771, "rewards/format_reward_step": 0.9609375, "step": 103 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5124609578344612, "calib/avg_num_step_conf": 7.74609375, "calib/ece": 0.44361445783132536, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0006045028630921045, "calib/mean_conf": 0.9897991967871487, "calib/mu_c": 0.9900735294117647, "calib/mu_w": 0.9894690265486726, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.44361445783132536, "calib/std_conf": 0.002755031788938556, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.9233913043478261, "calib/step_q_c_n": 1150.0, "calib/step_q_gap": 0.007376898585521263, "calib/step_q_w": 0.9160144057623049, "calib/step_q_w_n": 833.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1909.0, "completions/max_terminated_length": 1909.0, "completions/mean_length": 839.453125, "completions/mean_terminated_length": 852.77783203125, "completions/min_length": 0.0, "completions/min_terminated_length": 315.0, "epoch": 0.11093333333333333, "grad_norm": 0.005728873424232006, "kl": 0.06304931640625, "learning_rate": 2.666666666666667e-06, "loss": -0.0487, "num_tokens": 34160603.0, "reward": 0.7302390336990356, "reward_std": 0.22601860761642456, "rewards/accuracy_reward_step": 0.53125, "rewards/asymmetric_l2_reward": 0.6333199143409729, "rewards/final_brier_reward_step": 0.5287206768989563, "rewards/format_reward_step": 0.9609375, "step": 104 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5056179775280899, "calib/avg_num_step_conf": 7.75, "calib/ece": 0.3445418326693228, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00011235955056165814, "calib/mean_conf": 0.9899601593625499, "calib/mu_c": 0.99, "calib/mu_w": 0.9898876404494383, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3445418326693228, "calib/std_conf": 0.0006299357888781638, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9251345454545454, "calib/step_q_c_n": 1375.0, "calib/step_q_gap": 0.014510571727123112, "calib/step_q_w": 0.9106239737274223, "calib/step_q_w_n": 609.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2800.0, "completions/max_terminated_length": 2800.0, "completions/mean_length": 870.8203125, "completions/mean_terminated_length": 881.1463012695312, "completions/min_length": 0.0, "completions/min_terminated_length": 597.0, "epoch": 0.112, "grad_norm": 0.0053633954375982285, "kl": 0.057952880859375, "learning_rate": 2.6388888888888893e-06, "loss": 0.0069, "num_tokens": 34489293.0, "reward": 0.8335608243942261, "reward_std": 0.22055912017822266, "rewards/accuracy_reward_step": 0.6328125, "rewards/asymmetric_l2_reward": 0.7094079256057739, "rewards/final_brier_reward_step": 0.6358386278152466, "rewards/format_reward_step": 0.9765625, "step": 105 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.08984375, "calib/ece": 0.42200000000000015, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.42200000000000015, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9289254210104251, "calib/step_q_c_n": 1247.0, "calib/step_q_gap": 0.009592896738580436, "calib/step_q_w": 0.9193325242718446, "calib/step_q_w_n": 824.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2916.0, "completions/max_terminated_length": 2916.0, "completions/mean_length": 874.92578125, "completions/mean_terminated_length": 885.3004150390625, "completions/min_length": 0.0, "completions/min_terminated_length": 609.0, "epoch": 0.11306666666666666, "grad_norm": 0.005669588688760996, "kl": 0.06478118896484375, "learning_rate": 2.6111111111111113e-06, "loss": 0.0109, "num_tokens": 34817858.0, "reward": 0.7706383466720581, "reward_std": 0.1852344423532486, "rewards/accuracy_reward_step": 0.5546875, "rewards/asymmetric_l2_reward": 0.672858476638794, "rewards/final_brier_reward_step": 0.562949538230896, "rewards/format_reward_step": 0.97265625, "step": 106 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4969715956558062, "calib/avg_num_step_conf": 8.6015625, "calib/ece": 0.3194901960784313, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -6.0568086883594496e-05, "calib/mean_conf": 0.9900784313725489, "calib/mu_c": 0.9900584795321639, "calib/mu_w": 0.9901190476190475, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3194901960784313, "calib/std_conf": 0.0008821350493491768, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9264769647696477, "calib/step_q_c_n": 1476.0, "calib/step_q_gap": 0.0017937691773610709, "calib/step_q_w": 0.9246831955922866, "calib/step_q_w_n": 726.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1709.0, "completions/max_terminated_length": 1709.0, "completions/mean_length": 837.74609375, "completions/mean_terminated_length": 841.0314331054688, "completions/min_length": 0.0, "completions/min_terminated_length": 533.0, "epoch": 0.11413333333333334, "grad_norm": 0.005860211793333292, "kl": 0.065277099609375, "learning_rate": 2.5833333333333337e-06, "loss": 0.008, "num_tokens": 35136937.0, "reward": 0.8771469593048096, "reward_std": 0.20320641994476318, "rewards/accuracy_reward_step": 0.66796875, "rewards/asymmetric_l2_reward": 0.7518141269683838, "rewards/final_brier_reward_step": 0.6704484224319458, "rewards/format_reward_step": 0.9921875, "step": 107 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5130434782608695, "calib/avg_num_step_conf": 8.34375, "calib/ece": 0.2510803212851407, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0002571070234114181, "calib/mean_conf": 0.9900361445783133, "calib/mu_c": 0.9901032608695653, "calib/mu_w": 0.9898461538461539, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2510803212851407, "calib/std_conf": 0.0010617003464647918, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.928213880126183, "calib/step_q_c_n": 1585.0, "calib/step_q_gap": 0.008195731305856269, "calib/step_q_w": 0.9200181488203267, "calib/step_q_w_n": 551.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1848.0, "completions/max_terminated_length": 1848.0, "completions/mean_length": 858.20703125, "completions/mean_terminated_length": 878.8040161132812, "completions/min_length": 0.0, "completions/min_terminated_length": 542.0, "epoch": 0.1152, "grad_norm": 0.0051793381571769714, "kl": 0.06256103515625, "learning_rate": 2.5555555555555557e-06, "loss": -0.0585, "num_tokens": 35459870.0, "reward": 0.9190077781677246, "reward_std": 0.19223986566066742, "rewards/accuracy_reward_step": 0.72265625, "rewards/asymmetric_l2_reward": 0.7760034203529358, "rewards/final_brier_reward_step": 0.7237308025360107, "rewards/format_reward_step": 0.96875, "step": 108 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5042735042735043, "calib/avg_num_step_conf": 7.69921875, "calib/ece": 0.46165322580645163, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0002564102564103221, "calib/mean_conf": 0.9898790322580645, "calib/mu_c": 0.99, "calib/mu_w": 0.9897435897435897, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.46165322580645163, "calib/std_conf": 0.0019011572958268217, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9315300084530854, "calib/step_q_c_n": 1183.0, "calib/step_q_gap": 0.01933457698100427, "calib/step_q_w": 0.9121954314720812, "calib/step_q_w_n": 788.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2762.0, "completions/max_terminated_length": 2762.0, "completions/mean_length": 899.1953125, "completions/mean_terminated_length": 917.1076049804688, "completions/min_length": 0.0, "completions/min_terminated_length": 614.0, "epoch": 0.11626666666666667, "grad_norm": 0.004966530948877335, "kl": 0.05828094482421875, "learning_rate": 2.5277777777777778e-06, "loss": 0.0011, "num_tokens": 35794664.0, "reward": 0.7248979806900024, "reward_std": 0.16931957006454468, "rewards/accuracy_reward_step": 0.51171875, "rewards/asymmetric_l2_reward": 0.6382572650909424, "rewards/final_brier_reward_step": 0.5170074105262756, "rewards/format_reward_step": 0.9609375, "step": 109 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.49204129204129204, "calib/avg_num_step_conf": 9.16015625, "calib/ece": 0.3455859375000001, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0001591741591741025, "calib/mean_conf": 0.9901171875000001, "calib/mu_c": 0.990060606060606, "calib/mu_w": 0.9902197802197801, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3455859375000001, "calib/std_conf": 0.0010761701026528064, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9327749029754205, "calib/step_q_c_n": 1546.0, "calib/step_q_gap": 0.004339358544882432, "calib/step_q_w": 0.9284355444305381, "calib/step_q_w_n": 799.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1351.0, "completions/max_terminated_length": 1351.0, "completions/mean_length": 798.1796875, "completions/mean_terminated_length": 801.3098754882812, "completions/min_length": 0.0, "completions/min_terminated_length": 504.0, "epoch": 0.11733333333333333, "grad_norm": 0.005862347781658173, "kl": 0.06954193115234375, "learning_rate": 2.5e-06, "loss": 0.0115, "num_tokens": 36103918.0, "reward": 0.8563417196273804, "reward_std": 0.24133282899856567, "rewards/accuracy_reward_step": 0.64453125, "rewards/asymmetric_l2_reward": 0.7323915958404541, "rewards/final_brier_reward_step": 0.651385486125946, "rewards/format_reward_step": 1.0, "step": 110 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.508294930875576, "calib/avg_num_step_conf": 8.734375, "calib/ece": 0.3772727272727272, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00037063857801167277, "calib/mean_conf": 0.9899209486166007, "calib/mu_c": 0.9900645161290321, "calib/mu_w": 0.9896938775510205, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3772727272727272, "calib/std_conf": 0.0021764268613337894, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.932752100840336, "calib/step_q_c_n": 1428.0, "calib/step_q_gap": 0.008841209751227153, "calib/step_q_w": 0.9239108910891088, "calib/step_q_w_n": 808.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2145.0, "completions/max_terminated_length": 2145.0, "completions/mean_length": 869.59765625, "completions/mean_terminated_length": 879.9091186523438, "completions/min_length": 0.0, "completions/min_terminated_length": 581.0, "epoch": 0.1184, "grad_norm": 0.00547254690900445, "kl": 0.061725616455078125, "learning_rate": 2.4722222222222226e-06, "loss": -0.0046, "num_tokens": 36433943.0, "reward": 0.8111904263496399, "reward_std": 0.2672760486602783, "rewards/accuracy_reward_step": 0.60546875, "rewards/asymmetric_l2_reward": 0.6959226131439209, "rewards/final_brier_reward_step": 0.6092706918716431, "rewards/format_reward_step": 0.98046875, "step": 111 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.492980195333695, "calib/avg_num_step_conf": 8.5, "calib/ece": 0.3796787148594378, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00014039609332594516, "calib/mean_conf": 0.990120481927711, "calib/mu_c": 0.9900657894736843, "calib/mu_w": 0.9902061855670102, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3796787148594378, "calib/std_conf": 0.0010910102576069185, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.934830623306233, "calib/step_q_c_n": 1476.0, "calib/step_q_gap": 0.020387766163375898, "calib/step_q_w": 0.9144428571428571, "calib/step_q_w_n": 700.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1628.0, "completions/max_terminated_length": 1628.0, "completions/mean_length": 866.765625, "completions/mean_terminated_length": 887.5680541992188, "completions/min_length": 0.0, "completions/min_terminated_length": 597.0, "epoch": 0.11946666666666667, "grad_norm": 0.0054887994192540646, "kl": 0.05815887451171875, "learning_rate": 2.4444444444444447e-06, "loss": -0.0374, "num_tokens": 36763755.0, "reward": 0.8039360046386719, "reward_std": 0.17428997159004211, "rewards/accuracy_reward_step": 0.59375, "rewards/asymmetric_l2_reward": 0.6935150027275085, "rewards/final_brier_reward_step": 0.6010757684707642, "rewards/format_reward_step": 0.97265625, "step": 112 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5083067517278044, "calib/avg_num_step_conf": 9.53125, "calib/ece": 0.3844223107569721, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00016679957469456497, "calib/mean_conf": 0.99, "calib/mu_c": 0.9900657894736843, "calib/mu_w": 0.9898989898989897, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3844223107569721, "calib/std_conf": 0.0008926436853549044, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9353999999999999, "calib/step_q_c_n": 1500.0, "calib/step_q_gap": 0.007368085106382871, "calib/step_q_w": 0.928031914893617, "calib/step_q_w_n": 940.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2108.0, "completions/max_terminated_length": 2108.0, "completions/mean_length": 840.62109375, "completions/mean_terminated_length": 853.96435546875, "completions/min_length": 0.0, "completions/min_terminated_length": 571.0, "epoch": 0.12053333333333334, "grad_norm": 0.005846248008310795, "kl": 0.06928253173828125, "learning_rate": 2.4166666666666667e-06, "loss": -0.0151, "num_tokens": 37084154.0, "reward": 0.8061826229095459, "reward_std": 0.23986557126045227, "rewards/accuracy_reward_step": 0.59375, "rewards/asymmetric_l2_reward": 0.6960577964782715, "rewards/final_brier_reward_step": 0.6014636754989624, "rewards/format_reward_step": 0.98046875, "step": 113 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4959439810208923, "calib/avg_num_step_conf": 9.890625, "calib/ece": 0.27976190476190477, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -8.11203795821891e-05, "calib/mean_conf": 0.9900793650793651, "calib/mu_c": 0.9900558659217875, "calib/mu_w": 0.9901369863013697, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27976190476190477, "calib/std_conf": 0.0008873285624999172, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9386128364389235, "calib/step_q_c_n": 1932.0, "calib/step_q_gap": 0.01744616977225677, "calib/step_q_w": 0.9211666666666667, "calib/step_q_w_n": 600.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1797.0, "completions/max_terminated_length": 1797.0, "completions/mean_length": 843.71484375, "completions/mean_terminated_length": 853.7194213867188, "completions/min_length": 0.0, "completions/min_terminated_length": 601.0, "epoch": 0.1216, "grad_norm": 0.005540243349969387, "kl": 0.0713348388671875, "learning_rate": 2.388888888888889e-06, "loss": -0.0212, "num_tokens": 37405169.0, "reward": 0.9041121006011963, "reward_std": 0.21608759462833405, "rewards/accuracy_reward_step": 0.69921875, "rewards/asymmetric_l2_reward": 0.7667593955993652, "rewards/final_brier_reward_step": 0.7047460675239563, "rewards/format_reward_step": 0.984375, "step": 114 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 9.14453125, "calib/ece": 0.4384126984126985, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1102230246251565e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4384126984126985, "calib/std_conf": 0.0008908708063747488, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9359250936329588, "calib/step_q_c_n": 1335.0, "calib/step_q_gap": 0.007992688066358444, "calib/step_q_w": 0.9279324055666004, "calib/step_q_w_n": 1006.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2821.0, "completions/max_terminated_length": 2821.0, "completions/mean_length": 874.20703125, "completions/mean_terminated_length": 881.090576171875, "completions/min_length": 0.0, "completions/min_terminated_length": 647.0, "epoch": 0.12266666666666666, "grad_norm": 0.005463681183755398, "kl": 0.0593414306640625, "learning_rate": 2.361111111111111e-06, "loss": 0.0265, "num_tokens": 37734230.0, "reward": 0.7447854280471802, "reward_std": 0.2918502688407898, "rewards/accuracy_reward_step": 0.54296875, "rewards/asymmetric_l2_reward": 0.6380273699760437, "rewards/final_brier_reward_step": 0.5476371049880981, "rewards/format_reward_step": 0.9765625, "step": 115 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5059880239520959, "calib/avg_num_step_conf": 9.203125, "calib/ece": 0.33000000000000007, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00011976047904205256, "calib/mean_conf": 0.9900790513833992, "calib/mu_c": 0.9901197604790417, "calib/mu_w": 0.9899999999999997, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.33000000000000007, "calib/std_conf": 0.0008855872135339169, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.93751329001772, "calib/step_q_c_n": 1693.0, "calib/step_q_gap": 0.013365477046377694, "calib/step_q_w": 0.9241478129713423, "calib/step_q_w_n": 663.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2350.0, "completions/max_terminated_length": 2350.0, "completions/mean_length": 885.22265625, "completions/mean_terminated_length": 892.1929321289062, "completions/min_length": 0.0, "completions/min_terminated_length": 527.0, "epoch": 0.12373333333333333, "grad_norm": 0.005527203902602196, "kl": 0.06006622314453125, "learning_rate": 2.3333333333333336e-06, "loss": -0.004, "num_tokens": 38065367.0, "reward": 0.8455690741539001, "reward_std": 0.22002533078193665, "rewards/accuracy_reward_step": 0.65234375, "rewards/asymmetric_l2_reward": 0.7151409387588501, "rewards/final_brier_reward_step": 0.6509972810745239, "rewards/format_reward_step": 0.97265625, "step": 116 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 9.328125, "calib/ece": 0.4821259842519685, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 2.220446049250313e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.4821259842519685, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.9390930053804766, "calib/step_q_c_n": 1301.0, "calib/step_q_gap": 0.008163842546989875, "calib/step_q_w": 0.9309291628334867, "calib/step_q_w_n": 1087.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1801.0, "completions/max_terminated_length": 1801.0, "completions/mean_length": 856.30078125, "completions/mean_terminated_length": 863.0432739257812, "completions/min_length": 0.0, "completions/min_terminated_length": 624.0, "epoch": 0.1248, "grad_norm": 0.0059682149440050125, "kl": 0.0672149658203125, "learning_rate": 2.305555555555556e-06, "loss": 0.0093, "num_tokens": 38391180.0, "reward": 0.7150706648826599, "reward_std": 0.28290462493896484, "rewards/accuracy_reward_step": 0.50390625, "rewards/asymmetric_l2_reward": 0.6275831460952759, "rewards/final_brier_reward_step": 0.505683183670044, "rewards/format_reward_step": 0.98046875, "step": 117 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.495, "calib/avg_num_step_conf": 9.69921875, "calib/ece": 0.385292490118577, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -8.999999999992347e-05, "calib/mean_conf": 0.9900355731225295, "calib/mu_c": 0.99, "calib/mu_w": 0.9900899999999999, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.385292490118577, "calib/std_conf": 0.0005647058134288065, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9361940768746063, "calib/step_q_c_n": 1587.0, "calib/step_q_gap": 0.006682916160320573, "calib/step_q_w": 0.9295111607142857, "calib/step_q_w_n": 896.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2272.0, "completions/max_terminated_length": 2272.0, "completions/mean_length": 882.9375, "completions/mean_terminated_length": 886.4000244140625, "completions/min_length": 0.0, "completions/min_terminated_length": 558.0, "epoch": 0.12586666666666665, "grad_norm": 0.005273620132356882, "kl": 0.06341552734375, "learning_rate": 2.277777777777778e-06, "loss": -0.0084, "num_tokens": 38721220.0, "reward": 0.7888277769088745, "reward_std": 0.21463337540626526, "rewards/accuracy_reward_step": 0.59765625, "rewards/asymmetric_l2_reward": 0.65985506772995, "rewards/final_brier_reward_step": 0.6013941168785095, "rewards/format_reward_step": 0.984375, "step": 118 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5083723137414412, "calib/avg_num_step_conf": 9.23828125, "calib/ece": 0.3893951612903227, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00016744627482856433, "calib/mean_conf": 0.990201612903226, "calib/mu_c": 0.9902684563758386, "calib/mu_w": 0.99010101010101, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3893951612903227, "calib/std_conf": 0.0014055181498333384, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9335238095238095, "calib/step_q_c_n": 1365.0, "calib/step_q_gap": -0.0021461904761905792, "calib/step_q_w": 0.9356700000000001, "calib/step_q_w_n": 1000.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2580.0, "completions/max_terminated_length": 2580.0, "completions/mean_length": 881.9140625, "completions/mean_terminated_length": 906.706787109375, "completions/min_length": 0.0, "completions/min_terminated_length": 648.0, "epoch": 0.12693333333333334, "grad_norm": 0.005632383283227682, "kl": 0.05834197998046875, "learning_rate": 2.25e-06, "loss": -0.0389, "num_tokens": 39052054.0, "reward": 0.7916051149368286, "reward_std": 0.2506682276725769, "rewards/accuracy_reward_step": 0.58203125, "rewards/asymmetric_l2_reward": 0.6834613084793091, "rewards/final_brier_reward_step": 0.5895925760269165, "rewards/format_reward_step": 0.96875, "step": 119 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.4935897435897436, "calib/avg_num_step_conf": 8.703125, "calib/ece": 0.3084081632653062, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00012820512820477248, "calib/mean_conf": 0.9900408163265306, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9901282051282047, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3084081632653062, "calib/std_conf": 0.0006375714021148297, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9386550060313631, "calib/step_q_c_n": 1658.0, "calib/step_q_gap": 0.021988339364696463, "calib/step_q_w": 0.9166666666666666, "calib/step_q_w_n": 570.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2031.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 839.234375, "completions/mean_terminated_length": 869.8137817382812, "completions/min_length": 0.0, "completions/min_terminated_length": 477.0, "epoch": 0.128, "grad_norm": 0.005683529190719128, "kl": 0.06261444091796875, "learning_rate": 2.222222222222222e-06, "loss": -0.0744, "num_tokens": 39373586.0, "reward": 0.8539406061172485, "reward_std": 0.18444295227527618, "rewards/accuracy_reward_step": 0.65234375, "rewards/asymmetric_l2_reward": 0.7324292063713074, "rewards/final_brier_reward_step": 0.6543581485748291, "rewards/format_reward_step": 0.953125, "step": 120 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.234375, "calib/ece": 0.37247011952191234, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -2.220446049250313e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9900000000000001, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.37247011952191234, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9339252336448599, "calib/step_q_c_n": 1391.0, "calib/step_q_gap": 0.01022927827526443, "calib/step_q_w": 0.9236959553695955, "calib/step_q_w_n": 717.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1811.0, "completions/max_terminated_length": 1811.0, "completions/mean_length": 894.49609375, "completions/mean_terminated_length": 908.6945190429688, "completions/min_length": 0.0, "completions/min_terminated_length": 543.0, "epoch": 0.12906666666666666, "grad_norm": 0.005683033261448145, "kl": 0.052341461181640625, "learning_rate": 2.1944444444444445e-06, "loss": -0.0099, "num_tokens": 39707633.0, "reward": 0.810142993927002, "reward_std": 0.2799479365348816, "rewards/accuracy_reward_step": 0.60546875, "rewards/asymmetric_l2_reward": 0.6949148178100586, "rewards/final_brier_reward_step": 0.6089648008346558, "rewards/format_reward_step": 0.9765625, "step": 121 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 9.04296875, "calib/ece": 0.37095238095238103, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1102230246251565e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.99, "calib/mu_w": 0.9900000000000001, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37095238095238103, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9404286628278952, "calib/step_q_c_n": 1563.0, "calib/step_q_gap": 0.015561641551299465, "calib/step_q_w": 0.9248670212765957, "calib/step_q_w_n": 752.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2896.0, "completions/max_terminated_length": 2896.0, "completions/mean_length": 886.609375, "completions/mean_terminated_length": 893.590576171875, "completions/min_length": 0.0, "completions/min_terminated_length": 599.0, "epoch": 0.13013333333333332, "grad_norm": 0.005477050319314003, "kl": 0.05792999267578125, "learning_rate": 2.166666666666667e-06, "loss": -0.0096, "num_tokens": 40041949.0, "reward": 0.8290457725524902, "reward_std": 0.15693755447864532, "rewards/accuracy_reward_step": 0.609375, "rewards/asymmetric_l2_reward": 0.7225650548934937, "rewards/final_brier_reward_step": 0.6167765259742737, "rewards/format_reward_step": 0.984375, "step": 122 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.49900064474532557, "calib/avg_num_step_conf": 8.35546875, "calib/ece": 0.4283266932270916, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.9987105093433044e-05, "calib/mean_conf": 0.9900796812749003, "calib/mu_c": 0.9900709219858154, "calib/mu_w": 0.9900909090909088, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4283266932270916, "calib/std_conf": 0.0008890802232837218, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9342648253452477, "calib/step_q_c_n": 1231.0, "calib/step_q_gap": 0.006148085257141944, "calib/step_q_w": 0.9281167400881057, "calib/step_q_w_n": 908.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2574.0, "completions/max_terminated_length": 2574.0, "completions/mean_length": 926.16015625, "completions/mean_terminated_length": 937.142333984375, "completions/min_length": 0.0, "completions/min_terminated_length": 605.0, "epoch": 0.1312, "grad_norm": 0.005605003330856562, "kl": 0.050136566162109375, "learning_rate": 2.138888888888889e-06, "loss": -0.0243, "num_tokens": 40384334.0, "reward": 0.752850353717804, "reward_std": 0.23959524929523468, "rewards/accuracy_reward_step": 0.55078125, "rewards/asymmetric_l2_reward": 0.640251100063324, "rewards/final_brier_reward_step": 0.5591995716094971, "rewards/format_reward_step": 0.98046875, "step": 123 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.640625, "calib/ece": 0.30200000000000016, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 2.220446049250313e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9899999999999997, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30200000000000016, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9351959544879899, "calib/step_q_c_n": 1582.0, "calib/step_q_gap": 0.015354684646720074, "calib/step_q_w": 0.9198412698412698, "calib/step_q_w_n": 630.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2876.0, "completions/max_terminated_length": 2876.0, "completions/mean_length": 900.5390625, "completions/mean_terminated_length": 914.8333740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 613.0, "epoch": 0.13226666666666667, "grad_norm": 0.005689248442649841, "kl": 0.06095123291015625, "learning_rate": 2.1111111111111114e-06, "loss": -0.0013, "num_tokens": 40721688.0, "reward": 0.8809398412704468, "reward_std": 0.19557476043701172, "rewards/accuracy_reward_step": 0.671875, "rewards/asymmetric_l2_reward": 0.7551801204681396, "rewards/final_brier_reward_step": 0.6777933835983276, "rewards/format_reward_step": 0.97265625, "step": 124 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5073529411764706, "calib/avg_num_step_conf": 7.828125, "calib/ece": 0.4372357723577236, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00014705882352983313, "calib/mean_conf": 0.9900813008130082, "calib/mu_c": 0.9901470588235296, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4372357723577236, "calib/std_conf": 0.0008979968306656318, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9339572649572649, "calib/step_q_c_n": 1170.0, "calib/step_q_gap": 0.010084363278607844, "calib/step_q_w": 0.9238729016786571, "calib/step_q_w_n": 834.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2943.0, "completions/max_terminated_length": 2943.0, "completions/mean_length": 972.515625, "completions/mean_terminated_length": 980.1732177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 645.0, "epoch": 0.13333333333333333, "grad_norm": 0.005403018090873957, "kl": 0.0468597412109375, "learning_rate": 2.0833333333333334e-06, "loss": 0.0194, "num_tokens": 41075460.0, "reward": 0.7486984133720398, "reward_std": 0.25946474075317383, "rewards/accuracy_reward_step": 0.53125, "rewards/asymmetric_l2_reward": 0.6592109203338623, "rewards/final_brier_reward_step": 0.5397484302520752, "rewards/format_reward_step": 0.9609375, "step": 125 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5049019607843137, "calib/avg_num_step_conf": 8.578125, "calib/ece": 0.40616326530612246, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00039215686274518546, "calib/mean_conf": 0.9898367346938776, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.9896078431372546, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.40616326530612246, "calib/std_conf": 0.002550285608459317, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9396590909090908, "calib/step_q_c_n": 1408.0, "calib/step_q_gap": 0.014443354868481806, "calib/step_q_w": 0.925215736040609, "calib/step_q_w_n": 788.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2551.0, "completions/max_terminated_length": 2551.0, "completions/mean_length": 887.69921875, "completions/mean_terminated_length": 905.3825073242188, "completions/min_length": 0.0, "completions/min_terminated_length": 585.0, "epoch": 0.1344, "grad_norm": 0.004927393514662981, "kl": 0.054393768310546875, "learning_rate": 2.0555555555555555e-06, "loss": 0.0065, "num_tokens": 41408175.0, "reward": 0.7703766822814941, "reward_std": 0.22320319712162018, "rewards/accuracy_reward_step": 0.55859375, "rewards/asymmetric_l2_reward": 0.6708583831787109, "rewards/final_brier_reward_step": 0.5667698979377747, "rewards/format_reward_step": 0.95703125, "step": 126 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4993502274204028, "calib/avg_num_step_conf": 8.5234375, "calib/ece": 0.4479076305220885, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -4.873294347174095e-06, "calib/mean_conf": 0.9900763052208836, "calib/mu_c": 0.9900740740740739, "calib/mu_w": 0.9900789473684211, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4479076305220885, "calib/std_conf": 0.0008491673237872525, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9387752053771471, "calib/step_q_c_n": 1339.0, "calib/step_q_gap": 0.017132263502888545, "calib/step_q_w": 0.9216429418742585, "calib/step_q_w_n": 843.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3033.0, "completions/max_terminated_length": 3033.0, "completions/mean_length": 862.7578125, "completions/mean_terminated_length": 872.9881591796875, "completions/min_length": 0.0, "completions/min_terminated_length": 512.0, "epoch": 0.13546666666666668, "grad_norm": 0.00600149342790246, "kl": 0.056011199951171875, "learning_rate": 2.027777777777778e-06, "loss": -0.0121, "num_tokens": 41732713.0, "reward": 0.7328608632087708, "reward_std": 0.23117367923259735, "rewards/accuracy_reward_step": 0.52734375, "rewards/asymmetric_l2_reward": 0.630497395992279, "rewards/final_brier_reward_step": 0.536005437374115, "rewards/format_reward_step": 0.96875, "step": 127 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5037593984962406, "calib/avg_num_step_conf": 8.27734375, "calib/ece": 0.45590361445783145, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 7.518796992500576e-05, "calib/mean_conf": 0.9900401606425704, "calib/mu_c": 0.9900751879699249, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.45590361445783145, "calib/std_conf": 0.0006324504316475356, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9362500000000002, "calib/step_q_c_n": 1224.0, "calib/step_q_gap": 0.011155027932961081, "calib/step_q_w": 0.9250949720670392, "calib/step_q_w_n": 895.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 3013.0, "completions/max_terminated_length": 3013.0, "completions/mean_length": 862.859375, "completions/mean_terminated_length": 883.5680541992188, "completions/min_length": 0.0, "completions/min_terminated_length": 553.0, "epoch": 0.13653333333333334, "grad_norm": 0.005881587043404579, "kl": 0.0509796142578125, "learning_rate": 2.0000000000000003e-06, "loss": -0.0153, "num_tokens": 42060269.0, "reward": 0.7400556802749634, "reward_std": 0.25382572412490845, "rewards/accuracy_reward_step": 0.51953125, "rewards/asymmetric_l2_reward": 0.65317702293396, "rewards/final_brier_reward_step": 0.5284968614578247, "rewards/format_reward_step": 0.97265625, "step": 128 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4971116590348714, "calib/avg_num_step_conf": 9.0625, "calib/ece": 0.32738095238095244, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -5.776681930247385e-05, "calib/mean_conf": 0.9900793650793651, "calib/mu_c": 0.9900598802395209, "calib/mu_w": 0.9901176470588233, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32738095238095244, "calib/std_conf": 0.0008873285624999172, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9375094102885823, "calib/step_q_c_n": 1594.0, "calib/step_q_gap": 0.008900594861584898, "calib/step_q_w": 0.9286088154269974, "calib/step_q_w_n": 726.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1943.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 842.69921875, "completions/mean_terminated_length": 849.3346557617188, "completions/min_length": 0.0, "completions/min_terminated_length": 607.0, "epoch": 0.1376, "grad_norm": 0.005735983140766621, "kl": 0.05672454833984375, "learning_rate": 1.9722222222222224e-06, "loss": 0.0004, "num_tokens": 42378384.0, "reward": 0.8612784147262573, "reward_std": 0.17804312705993652, "rewards/accuracy_reward_step": 0.65234375, "rewards/asymmetric_l2_reward": 0.7364044785499573, "rewards/final_brier_reward_step": 0.6588085889816284, "rewards/format_reward_step": 0.984375, "step": 129 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5175119080975064, "calib/avg_num_step_conf": 9.140625, "calib/ece": 0.3312698412698414, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0003530400672457912, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.9901204819277106, "calib/mu_w": 0.9897674418604648, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3312698412698414, "calib/std_conf": 0.0012598815766974253, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.941146572104019, "calib/step_q_c_n": 1692.0, "calib/step_q_gap": 0.01682558444969795, "calib/step_q_w": 0.924320987654321, "calib/step_q_w_n": 648.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1581.0, "completions/max_terminated_length": 1581.0, "completions/mean_length": 828.35546875, "completions/mean_terminated_length": 831.6039428710938, "completions/min_length": 0.0, "completions/min_terminated_length": 475.0, "epoch": 0.13866666666666666, "grad_norm": 0.005649202037602663, "kl": 0.05800628662109375, "learning_rate": 1.944444444444445e-06, "loss": 0.0021, "num_tokens": 42695731.0, "reward": 0.8597501516342163, "reward_std": 0.16437068581581116, "rewards/accuracy_reward_step": 0.6484375, "rewards/asymmetric_l2_reward": 0.7433483004570007, "rewards/final_brier_reward_step": 0.6511518955230713, "rewards/format_reward_step": 0.9765625, "step": 130 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.49642857142857144, "calib/avg_num_step_conf": 8.43359375, "calib/ece": 0.5545564516129033, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -7.142857142861114e-05, "calib/mean_conf": 0.9900403225806452, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9900714285714285, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5545564516129033, "calib/std_conf": 0.0006337190986089405, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9365694849368319, "calib/step_q_c_n": 1029.0, "calib/step_q_gap": 0.010888069007628398, "calib/step_q_w": 0.9256814159292035, "calib/step_q_w_n": 1130.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2980.0, "completions/max_terminated_length": 2980.0, "completions/mean_length": 841.28515625, "completions/mean_terminated_length": 858.0438232421875, "completions/min_length": 0.0, "completions/min_terminated_length": 606.0, "epoch": 0.13973333333333332, "grad_norm": 0.005765112116932869, "kl": 0.05020904541015625, "learning_rate": 1.916666666666667e-06, "loss": -0.0213, "num_tokens": 43017308.0, "reward": 0.6251735687255859, "reward_std": 0.20246624946594238, "rewards/accuracy_reward_step": 0.421875, "rewards/asymmetric_l2_reward": 0.5395841002464294, "rewards/final_brier_reward_step": 0.43263787031173706, "rewards/format_reward_step": 0.96875, "step": 131 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5093550673281361, "calib/avg_num_step_conf": 9.55078125, "calib/ece": 0.3288804780876493, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00019532246633613415, "calib/mean_conf": 0.990235059760956, "calib/mu_c": 0.9903012048192771, "calib/mu_w": 0.990105882352941, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3288804780876493, "calib/std_conf": 0.0015031593211086455, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9406541490006057, "calib/step_q_c_n": 1651.0, "calib/step_q_gap": 0.005754904668112126, "calib/step_q_w": 0.9348992443324936, "calib/step_q_w_n": 794.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1554.0, "completions/max_terminated_length": 1554.0, "completions/mean_length": 829.8125, "completions/mean_terminated_length": 842.9841918945312, "completions/min_length": 0.0, "completions/min_terminated_length": 519.0, "epoch": 0.1408, "grad_norm": 0.005729543976485729, "kl": 0.05913543701171875, "learning_rate": 1.888888888888889e-06, "loss": -0.0197, "num_tokens": 43335332.0, "reward": 0.8531327843666077, "reward_std": 0.29228276014328003, "rewards/accuracy_reward_step": 0.6484375, "rewards/asymmetric_l2_reward": 0.726431131362915, "rewards/final_brier_reward_step": 0.6548343896865845, "rewards/format_reward_step": 0.9765625, "step": 132 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5042016806722689, "calib/avg_num_step_conf": 8.046875, "calib/ece": 0.5159362549800797, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 8.40336134451114e-05, "calib/mean_conf": 0.9900398406374502, "calib/mu_c": 0.9900840336134452, "calib/mu_w": 0.9900000000000001, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5159362549800797, "calib/std_conf": 0.0006299357888781636, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9339700000000001, "calib/step_q_c_n": 1000.0, "calib/step_q_gap": 0.005111509433962325, "calib/step_q_w": 0.9288584905660378, "calib/step_q_w_n": 1060.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1580.0, "completions/max_terminated_length": 1580.0, "completions/mean_length": 882.74609375, "completions/mean_terminated_length": 900.3306884765625, "completions/min_length": 0.0, "completions/min_terminated_length": 488.0, "epoch": 0.14186666666666667, "grad_norm": 0.00549586396664381, "kl": 0.04437255859375, "learning_rate": 1.8611111111111113e-06, "loss": -0.0307, "num_tokens": 43667659.0, "reward": 0.6816405057907104, "reward_std": 0.3141142427921295, "rewards/accuracy_reward_step": 0.46484375, "rewards/asymmetric_l2_reward": 0.599159836769104, "rewards/final_brier_reward_step": 0.47505855560302734, "rewards/format_reward_step": 0.98046875, "step": 133 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.511455525606469, "calib/avg_num_step_conf": 8.0078125, "calib/ece": 0.4090118577075099, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00023039404440994282, "calib/mean_conf": 0.9900395256916996, "calib/mu_c": 0.9901360544217686, "calib/mu_w": 0.9899056603773586, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4090118577075099, "calib/std_conf": 0.0010882134306668947, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9377241379310345, "calib/step_q_c_n": 1305.0, "calib/step_q_gap": 0.01595232585049755, "calib/step_q_w": 0.9217718120805369, "calib/step_q_w_n": 745.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1938.0, "completions/max_terminated_length": 1938.0, "completions/mean_length": 875.828125, "completions/mean_terminated_length": 886.2135009765625, "completions/min_length": 0.0, "completions/min_terminated_length": 530.0, "epoch": 0.14293333333333333, "grad_norm": 0.005863112863153219, "kl": 0.0474090576171875, "learning_rate": 1.8333333333333333e-06, "loss": -0.0197, "num_tokens": 44000823.0, "reward": 0.7867439985275269, "reward_std": 0.22116048634052277, "rewards/accuracy_reward_step": 0.57421875, "rewards/asymmetric_l2_reward": 0.6831960678100586, "rewards/final_brier_reward_step": 0.5785729885101318, "rewards/format_reward_step": 0.984375, "step": 134 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5033234383066598, "calib/avg_num_step_conf": 8.5546875, "calib/ece": 0.40102766798418976, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00025942178626736645, "calib/mean_conf": 0.9899604743083005, "calib/mu_c": 0.9900671140939596, "calib/mu_w": 0.9898076923076923, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.40102766798418976, "calib/std_conf": 0.002084769485255432, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.9392614047791454, "calib/step_q_c_n": 1381.0, "calib/step_q_gap": 0.01067055187432464, "calib/step_q_w": 0.9285908529048208, "calib/step_q_w_n": 809.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2331.0, "completions/max_terminated_length": 2331.0, "completions/mean_length": 892.8046875, "completions/mean_terminated_length": 896.305908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 585.0, "epoch": 0.144, "grad_norm": 0.005807646084576845, "kl": 0.051166534423828125, "learning_rate": 1.8055555555555557e-06, "loss": 0.0149, "num_tokens": 44335261.0, "reward": 0.7879598140716553, "reward_std": 0.24510522186756134, "rewards/accuracy_reward_step": 0.58203125, "rewards/asymmetric_l2_reward": 0.6771945953369141, "rewards/final_brier_reward_step": 0.5862249135971069, "rewards/format_reward_step": 0.98046875, "step": 135 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.537602933607823, "calib/avg_num_step_conf": 8.5390625, "calib/ece": 0.45431999999999995, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0007578486875965407, "calib/mean_conf": 0.99032, "calib/mu_c": 0.9906716417910447, "calib/mu_w": 0.9899137931034482, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.45431999999999995, "calib/std_conf": 0.001974234028680492, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9403050595238097, "calib/step_q_c_n": 1344.0, "calib/step_q_gap": 0.016124536958488922, "calib/step_q_w": 0.9241805225653208, "calib/step_q_w_n": 842.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2198.0, "completions/max_terminated_length": 2198.0, "completions/mean_length": 855.94921875, "completions/mean_terminated_length": 873.0, "completions/min_length": 0.0, "completions/min_terminated_length": 537.0, "epoch": 0.14506666666666668, "grad_norm": 0.005918469280004501, "kl": 0.0530548095703125, "learning_rate": 1.777777777777778e-06, "loss": -0.0125, "num_tokens": 44662872.0, "reward": 0.7209823131561279, "reward_std": 0.2641390860080719, "rewards/accuracy_reward_step": 0.5234375, "rewards/asymmetric_l2_reward": 0.612058699131012, "rewards/final_brier_reward_step": 0.5322495698928833, "rewards/format_reward_step": 0.96484375, "step": 136 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.49514563106796117, "calib/avg_num_step_conf": 8.859375, "calib/ece": 0.40204000000000006, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -9.70873786407811e-05, "calib/mean_conf": 0.99004, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.9900970873786406, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.40204000000000006, "calib/std_conf": 0.0006311893535223806, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9392547906316535, "calib/step_q_c_n": 1409.0, "calib/step_q_gap": 0.008323475148533488, "calib/step_q_w": 0.93093131548312, "calib/step_q_w_n": 859.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2289.0, "completions/max_terminated_length": 2289.0, "completions/mean_length": 856.4609375, "completions/mean_terminated_length": 866.6166381835938, "completions/min_length": 0.0, "completions/min_terminated_length": 628.0, "epoch": 0.14613333333333334, "grad_norm": 0.005562786012887955, "kl": 0.056423187255859375, "learning_rate": 1.75e-06, "loss": -0.0209, "num_tokens": 44989110.0, "reward": 0.7751529812812805, "reward_std": 0.22666768729686737, "rewards/accuracy_reward_step": 0.57421875, "rewards/asymmetric_l2_reward": 0.6627465486526489, "rewards/final_brier_reward_step": 0.5781843662261963, "rewards/format_reward_step": 0.97265625, "step": 137 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.496821172469875, "calib/avg_num_step_conf": 9.07421875, "calib/ece": 0.31669354838709685, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -6.357655060262868e-05, "calib/mean_conf": 0.9900806451612904, "calib/mu_c": 0.9900598802395209, "calib/mu_w": 0.9901234567901235, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.31669354838709685, "calib/std_conf": 0.0008943981053555988, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9411660561660562, "calib/step_q_c_n": 1638.0, "calib/step_q_gap": 0.012348537917881042, "calib/step_q_w": 0.9288175182481752, "calib/step_q_w_n": 685.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1652.0, "completions/max_terminated_length": 1652.0, "completions/mean_length": 856.09375, "completions/mean_terminated_length": 880.1605834960938, "completions/min_length": 0.0, "completions/min_terminated_length": 548.0, "epoch": 0.1472, "grad_norm": 0.008828936144709587, "kl": 0.07733917236328125, "learning_rate": 1.7222222222222224e-06, "loss": -0.048, "num_tokens": 45312606.0, "reward": 0.8505880832672119, "reward_std": 0.26470452547073364, "rewards/accuracy_reward_step": 0.65234375, "rewards/asymmetric_l2_reward": 0.723146915435791, "rewards/final_brier_reward_step": 0.6545917987823486, "rewards/format_reward_step": 0.96484375, "step": 138 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4989772727272728, "calib/avg_num_step_conf": 10.08984375, "calib/ece": 0.2889203187250996, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -2.537878787833847e-05, "calib/mean_conf": 0.9901155378486056, "calib/mu_c": 0.9901079545454546, "calib/mu_w": 0.990133333333333, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2889203187250996, "calib/std_conf": 0.001051747554259193, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9446305931321539, "calib/step_q_c_n": 1922.0, "calib/step_q_gap": 0.012860547746374684, "calib/step_q_w": 0.9317700453857792, "calib/step_q_w_n": 661.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2799.0, "completions/max_terminated_length": 2799.0, "completions/mean_length": 834.8203125, "completions/mean_terminated_length": 848.0714721679688, "completions/min_length": 0.0, "completions/min_terminated_length": 538.0, "epoch": 0.14826666666666666, "grad_norm": 0.005658579058945179, "kl": 0.06348419189453125, "learning_rate": 1.6944444444444446e-06, "loss": -0.0179, "num_tokens": 45629416.0, "reward": 0.9023306965827942, "reward_std": 0.2051040083169937, "rewards/accuracy_reward_step": 0.6875, "rewards/asymmetric_l2_reward": 0.7778832316398621, "rewards/final_brier_reward_step": 0.6931843161582947, "rewards/format_reward_step": 0.98046875, "step": 139 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4888345864661654, "calib/avg_num_step_conf": 10.1640625, "calib/ece": 0.2930278884462152, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0002233082706766254, "calib/mean_conf": 0.9902390438247013, "calib/mu_c": 0.9901714285714284, "calib/mu_w": 0.990394736842105, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2930278884462152, "calib/std_conf": 0.001527513108580146, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.946251287332647, "calib/step_q_c_n": 1942.0, "calib/step_q_gap": 0.011372499453859097, "calib/step_q_w": 0.9348787878787879, "calib/step_q_w_n": 660.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1917.0, "completions/max_terminated_length": 1917.0, "completions/mean_length": 853.45703125, "completions/mean_terminated_length": 863.5770874023438, "completions/min_length": 0.0, "completions/min_terminated_length": 566.0, "epoch": 0.14933333333333335, "grad_norm": 0.010869499295949936, "kl": 0.07050323486328125, "learning_rate": 1.6666666666666667e-06, "loss": -0.0113, "num_tokens": 45952917.0, "reward": 0.8823206424713135, "reward_std": 0.2503645420074463, "rewards/accuracy_reward_step": 0.68359375, "rewards/asymmetric_l2_reward": 0.7434866428375244, "rewards/final_brier_reward_step": 0.6891233921051025, "rewards/format_reward_step": 0.9765625, "step": 140 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5058139534883721, "calib/avg_num_step_conf": 9.2578125, "calib/ece": 0.2965322580645161, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.000116279069767522, "calib/mean_conf": 0.9900806451612904, "calib/mu_c": 0.9901162790697673, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2965322580645161, "calib/std_conf": 0.0008943981053555988, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.9439332603938732, "calib/step_q_c_n": 1828.0, "calib/step_q_gap": 0.020298574047009632, "calib/step_q_w": 0.9236346863468635, "calib/step_q_w_n": 542.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2350.0, "completions/max_terminated_length": 2350.0, "completions/mean_length": 893.41796875, "completions/mean_terminated_length": 900.4527587890625, "completions/min_length": 0.0, "completions/min_terminated_length": 80.0, "epoch": 0.1504, "grad_norm": 0.005139007233083248, "kl": 0.05944061279296875, "learning_rate": 1.638888888888889e-06, "loss": -0.0251, "num_tokens": 46288728.0, "reward": 0.868787407875061, "reward_std": 0.20100921392440796, "rewards/accuracy_reward_step": 0.67578125, "rewards/asymmetric_l2_reward": 0.7364983558654785, "rewards/final_brier_reward_step": 0.6737328171730042, "rewards/format_reward_step": 0.9609375, "step": 141 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5174994729074426, "calib/avg_num_step_conf": 11.21484375, "calib/ece": 0.36837398373983743, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00034998945814912474, "calib/mean_conf": 0.9903252032520325, "calib/mu_c": 0.9904575163398692, "calib/mu_w": 0.9901075268817201, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.36837398373983743, "calib/std_conf": 0.0017737743275830446, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9478168202764977, "calib/step_q_c_n": 1736.0, "calib/step_q_gap": -0.00021842201425137375, "calib/step_q_w": 0.9480352422907491, "calib/step_q_w_n": 1135.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2413.0, "completions/max_terminated_length": 2413.0, "completions/mean_length": 879.140625, "completions/mean_terminated_length": 911.1741333007812, "completions/min_length": 0.0, "completions/min_terminated_length": 604.0, "epoch": 0.15146666666666667, "grad_norm": 0.005070097278803587, "kl": 0.05767822265625, "learning_rate": 1.6111111111111113e-06, "loss": -0.0788, "num_tokens": 46618948.0, "reward": 0.7950658202171326, "reward_std": 0.19968588650226593, "rewards/accuracy_reward_step": 0.59765625, "rewards/asymmetric_l2_reward": 0.6783492565155029, "rewards/final_brier_reward_step": 0.6008449196815491, "rewards/format_reward_step": 0.95703125, "step": 142 }, { "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5051712328767124, "calib/avg_num_step_conf": 11.78515625, "calib/ece": 0.3965447154471545, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0004054794520548244, "calib/mean_conf": 0.9900406504065041, "calib/mu_c": 0.9902054794520547, "calib/mu_w": 0.9897999999999999, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3965447154471545, "calib/std_conf": 0.0029214607494034085, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9481639722863743, "calib/step_q_c_n": 1732.0, "calib/step_q_gap": -0.0004119032000069467, "calib/step_q_w": 0.9485758754863812, "calib/step_q_w_n": 1285.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2606.0, "completions/max_terminated_length": 2606.0, "completions/mean_length": 871.84375, "completions/mean_terminated_length": 903.6113891601562, "completions/min_length": 0.0, "completions/min_terminated_length": 611.0, "epoch": 0.15253333333333333, "grad_norm": 0.005727704614400864, "kl": 0.06136322021484375, "learning_rate": 1.5833333333333333e-06, "loss": -0.0503, "num_tokens": 46949476.0, "reward": 0.7622804045677185, "reward_std": 0.24852487444877625, "rewards/accuracy_reward_step": 0.5703125, "rewards/asymmetric_l2_reward": 0.6418509483337402, "rewards/final_brier_reward_step": 0.5780222415924072, "rewards/format_reward_step": 0.953125, "step": 143 }, { "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.5115606936416185, "calib/avg_num_step_conf": 12.29296875, "calib/ece": 0.26932916666666673, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0002254335260113427, "calib/mean_conf": 0.9901625000000001, "calib/mu_c": 0.9902254335260113, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.26932916666666673, "calib/std_conf": 0.001249437373380516, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9409575221238939, "calib/step_q_c_n": 1695.0, "calib/step_q_gap": -0.022279392476657, "calib/step_q_w": 0.9632369146005509, "calib/step_q_w_n": 1452.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2657.0, "completions/max_terminated_length": 2657.0, "completions/mean_length": 858.515625, "completions/mean_terminated_length": 904.4443969726562, "completions/min_length": 0.0, "completions/min_terminated_length": 646.0, "epoch": 0.1536, "grad_norm": 0.00599814485758543, "kl": 0.061065673828125, "learning_rate": 1.5555555555555558e-06, "loss": -0.0422, "num_tokens": 47273384.0, "reward": 0.8610941171646118, "reward_std": 0.2805580794811249, "rewards/accuracy_reward_step": 0.67578125, "rewards/asymmetric_l2_reward": 0.7203265428543091, "rewards/final_brier_reward_step": 0.6807678937911987, "rewards/format_reward_step": 0.9296875, "step": 144 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5060240963855421, "calib/avg_num_step_conf": 11.3203125, "calib/ece": 0.32072580645161297, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00012048192771074717, "calib/mean_conf": 0.9900806451612904, "calib/mu_c": 0.9901204819277106, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32072580645161297, "calib/std_conf": 0.0008943981053555989, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.944338682899834, "calib/step_q_c_n": 1807.0, "calib/step_q_gap": -0.007173691069001786, "calib/step_q_w": 0.9515123739688358, "calib/step_q_w_n": 1091.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2801.0, "completions/max_terminated_length": 2801.0, "completions/mean_length": 882.12109375, "completions/mean_terminated_length": 903.2920532226562, "completions/min_length": 0.0, "completions/min_terminated_length": 582.0, "epoch": 0.15466666666666667, "grad_norm": 0.005229916889220476, "kl": 0.06345367431640625, "learning_rate": 1.527777777777778e-06, "loss": -0.0314, "num_tokens": 47601911.0, "reward": 0.8385009765625, "reward_std": 0.27948635816574097, "rewards/accuracy_reward_step": 0.6484375, "rewards/asymmetric_l2_reward": 0.6988167762756348, "rewards/final_brier_reward_step": 0.6547476053237915, "rewards/format_reward_step": 0.96875, "step": 145 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5034776902887138, "calib/avg_num_step_conf": 10.2265625, "calib/ece": 0.4760323886639676, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 6.955380577400305e-05, "calib/mean_conf": 0.9902024291497975, "calib/mu_c": 0.9902362204724408, "calib/mu_w": 0.9901666666666668, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4760323886639676, "calib/std_conf": 0.0014083017919778223, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9437317784256561, "calib/step_q_c_n": 1372.0, "calib/step_q_gap": 0.003041569757919471, "calib/step_q_w": 0.9406902086677367, "calib/step_q_w_n": 1246.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3070.0, "completions/max_terminated_length": 3070.0, "completions/mean_length": 876.76171875, "completions/mean_terminated_length": 901.4096069335938, "completions/min_length": 0.0, "completions/min_terminated_length": 547.0, "epoch": 0.15573333333333333, "grad_norm": 0.005378072615712881, "kl": 0.061309814453125, "learning_rate": 1.5e-06, "loss": -0.0446, "num_tokens": 47933578.0, "reward": 0.6947678327560425, "reward_std": 0.23827584087848663, "rewards/accuracy_reward_step": 0.5, "rewards/asymmetric_l2_reward": 0.5913490056991577, "rewards/final_brier_reward_step": 0.5052179098129272, "rewards/format_reward_step": 0.96484375, "step": 146 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.47405221630636984, "calib/avg_num_step_conf": 11.54296875, "calib/ece": 0.5144400000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.000518955673872834, "calib/mean_conf": 0.9904400000000001, "calib/mu_c": 0.9901680672268905, "calib/mu_w": 0.9906870229007634, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5144400000000001, "calib/std_conf": 0.0020509509989270853, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9464776119402984, "calib/step_q_c_n": 1340.0, "calib/step_q_gap": -0.00118802273462415, "calib/step_q_w": 0.9476656346749226, "calib/step_q_w_n": 1615.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3019.0, "completions/max_terminated_length": 3019.0, "completions/mean_length": 884.7890625, "completions/mean_terminated_length": 902.4143676757812, "completions/min_length": 0.0, "completions/min_terminated_length": 609.0, "epoch": 0.1568, "grad_norm": 0.005266894120723009, "kl": 0.06313323974609375, "learning_rate": 1.4722222222222225e-06, "loss": -0.0287, "num_tokens": 48263764.0, "reward": 0.6583696007728577, "reward_std": 0.1766577810049057, "rewards/accuracy_reward_step": 0.46484375, "rewards/asymmetric_l2_reward": 0.5541763305664062, "rewards/final_brier_reward_step": 0.4742816388607025, "rewards/format_reward_step": 0.9765625, "step": 147 }, { "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5229885057471264, "calib/avg_num_step_conf": 13.546875, "calib/ece": 0.27427983539094647, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00045977011494247044, "calib/mean_conf": 0.9903292181069958, "calib/mu_c": 0.9904597701149424, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27427983539094647, "calib/std_conf": 0.0017843196204673922, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9481062469257255, "calib/step_q_c_n": 2033.0, "calib/step_q_gap": -0.014729990008072402, "calib/step_q_w": 0.9628362369337979, "calib/step_q_w_n": 1435.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1593.0, "completions/max_terminated_length": 1593.0, "completions/mean_length": 846.07421875, "completions/mean_terminated_length": 884.0611572265625, "completions/min_length": 0.0, "completions/min_terminated_length": 585.0, "epoch": 0.15786666666666666, "grad_norm": 0.006033976562321186, "kl": 0.065948486328125, "learning_rate": 1.4444444444444445e-06, "loss": -0.0728, "num_tokens": 48585471.0, "reward": 0.8637349605560303, "reward_std": 0.29628562927246094, "rewards/accuracy_reward_step": 0.6796875, "rewards/asymmetric_l2_reward": 0.7167023420333862, "rewards/final_brier_reward_step": 0.6849863529205322, "rewards/format_reward_step": 0.94921875, "step": 148 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.49508228968592227, "calib/avg_num_step_conf": 10.328125, "calib/ece": 0.3910317460317462, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -9.835420628145908e-05, "calib/mean_conf": 0.9902380952380954, "calib/mu_c": 0.9901986754966886, "calib/mu_w": 0.9902970297029701, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3910317460317462, "calib/std_conf": 0.0015245533898649653, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9458948004836759, "calib/step_q_c_n": 1654.0, "calib/step_q_gap": 0.009228133817009154, "calib/step_q_w": 0.9366666666666668, "calib/step_q_w_n": 990.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1573.0, "completions/max_terminated_length": 1573.0, "completions/mean_length": 883.15234375, "completions/mean_terminated_length": 893.62451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 613.0, "epoch": 0.15893333333333334, "grad_norm": 0.005128260236233473, "kl": 0.0658111572265625, "learning_rate": 1.4166666666666667e-06, "loss": -0.0232, "num_tokens": 48916014.0, "reward": 0.7963246703147888, "reward_std": 0.24546730518341064, "rewards/accuracy_reward_step": 0.58984375, "rewards/asymmetric_l2_reward": 0.680401623249054, "rewards/final_brier_reward_step": 0.597403883934021, "rewards/format_reward_step": 0.984375, "step": 149 }, { "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.5257165605095542, "calib/avg_num_step_conf": 12.86328125, "calib/ece": 0.3281434599156119, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0005143312101910658, "calib/mean_conf": 0.9905907172995781, "calib/mu_c": 0.990764331210191, "calib/mu_w": 0.99025, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3281434599156119, "calib/std_conf": 0.002357589037080001, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9469197630586969, "calib/step_q_c_n": 1857.0, "calib/step_q_gap": -0.01014848206665131, "calib/step_q_w": 0.9570682451253482, "calib/step_q_w_n": 1436.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2986.0, "completions/max_terminated_length": 2986.0, "completions/mean_length": 848.53125, "completions/mean_terminated_length": 890.2622680664062, "completions/min_length": 0.0, "completions/min_terminated_length": 604.0, "epoch": 0.16, "grad_norm": 0.00580455781891942, "kl": 0.0648956298828125, "learning_rate": 1.3888888888888892e-06, "loss": -0.059, "num_tokens": 49238198.0, "reward": 0.8208293318748474, "reward_std": 0.2286253273487091, "rewards/accuracy_reward_step": 0.61328125, "rewards/asymmetric_l2_reward": 0.7145583629608154, "rewards/final_brier_reward_step": 0.61928790807724, "rewards/format_reward_step": 0.92578125, "step": 150 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5037756550922938, "calib/avg_num_step_conf": 8.5703125, "calib/ece": 0.4800763052208836, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 6.763908609774827e-05, "calib/mean_conf": 0.9901164658634539, "calib/mu_c": 0.9901496062992124, "calib/mu_w": 0.9900819672131147, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4800763052208836, "calib/std_conf": 0.0013849774764524091, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9414787066246058, "calib/step_q_c_n": 1268.0, "calib/step_q_gap": 0.026543501441020467, "calib/step_q_w": 0.9149352051835853, "calib/step_q_w_n": 926.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2751.0, "completions/max_terminated_length": 2751.0, "completions/mean_length": 912.84375, "completions/mean_terminated_length": 934.7520141601562, "completions/min_length": 0.0, "completions/min_terminated_length": 583.0, "epoch": 0.16106666666666666, "grad_norm": 0.005261977203190327, "kl": 0.05495452880859375, "learning_rate": 1.3611111111111112e-06, "loss": -0.0375, "num_tokens": 49578910.0, "reward": 0.7145134210586548, "reward_std": 0.2643086314201355, "rewards/accuracy_reward_step": 0.49609375, "rewards/asymmetric_l2_reward": 0.6306858062744141, "rewards/final_brier_reward_step": 0.5053722262382507, "rewards/format_reward_step": 0.96875, "step": 151 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5045454545454545, "calib/avg_num_step_conf": 9.4921875, "calib/ece": 0.4369918699186992, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0003636363636365658, "calib/mean_conf": 0.9898373983739838, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.9896363636363635, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4369918699186992, "calib/std_conf": 0.002545118023170494, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9398222222222222, "calib/step_q_c_n": 1350.0, "calib/step_q_gap": 0.00441481481481476, "calib/step_q_w": 0.9354074074074075, "calib/step_q_w_n": 1080.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2570.0, "completions/max_terminated_length": 2570.0, "completions/mean_length": 859.7890625, "completions/mean_terminated_length": 883.9597778320312, "completions/min_length": 0.0, "completions/min_terminated_length": 551.0, "epoch": 0.16213333333333332, "grad_norm": 0.005733448546379805, "kl": 0.05826568603515625, "learning_rate": 1.3333333333333334e-06, "loss": -0.0428, "num_tokens": 49904408.0, "reward": 0.7376753091812134, "reward_std": 0.24671050906181335, "rewards/accuracy_reward_step": 0.53125, "rewards/asymmetric_l2_reward": 0.6415493488311768, "rewards/final_brier_reward_step": 0.5361449122428894, "rewards/format_reward_step": 0.95703125, "step": 152 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.51953125, "calib/ece": 0.3540000000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1102230246251565e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3540000000000001, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9335677276091784, "calib/step_q_c_n": 1351.0, "calib/step_q_gap": 0.00018218544050363494, "calib/step_q_w": 0.9333855421686748, "calib/step_q_w_n": 830.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2805.0, "completions/max_terminated_length": 2805.0, "completions/mean_length": 865.0390625, "completions/mean_terminated_length": 875.2964477539062, "completions/min_length": 0.0, "completions/min_terminated_length": 579.0, "epoch": 0.1632, "grad_norm": 0.005814076401293278, "kl": 0.055614471435546875, "learning_rate": 1.3055555555555556e-06, "loss": 0.0202, "num_tokens": 50233178.0, "reward": 0.8162698745727539, "reward_std": 0.2791767120361328, "rewards/accuracy_reward_step": 0.62109375, "rewards/asymmetric_l2_reward": 0.6849030256271362, "rewards/final_brier_reward_step": 0.6281054019927979, "rewards/format_reward_step": 0.9765625, "step": 153 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5020976353928299, "calib/avg_num_step_conf": 9.01171875, "calib/ece": 0.4425793650793649, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 4.195270785656291e-05, "calib/mean_conf": 0.9901984126984126, "calib/mu_c": 0.9902173913043477, "calib/mu_w": 0.9901754385964912, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4425793650793649, "calib/std_conf": 0.001394546300857587, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9370293209876543, "calib/step_q_c_n": 1296.0, "calib/step_q_gap": 0.0033695781587720353, "calib/step_q_w": 0.9336597428288823, "calib/step_q_w_n": 1011.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1551.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 815.5703125, "completions/mean_terminated_length": 828.5159301757812, "completions/min_length": 0.0, "completions/min_terminated_length": 492.0, "epoch": 0.16426666666666667, "grad_norm": 0.006068110000342131, "kl": 0.060028076171875, "learning_rate": 1.2777777777777779e-06, "loss": -0.0165, "num_tokens": 50546404.0, "reward": 0.750443696975708, "reward_std": 0.22490191459655762, "rewards/accuracy_reward_step": 0.5390625, "rewards/asymmetric_l2_reward": 0.6540299654006958, "rewards/final_brier_reward_step": 0.5437324047088623, "rewards/format_reward_step": 0.9765625, "step": 154 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.49916634603052457, "calib/avg_num_step_conf": 8.2578125, "calib/ece": 0.44027888446215147, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.6031807105409968e-05, "calib/mean_conf": 0.9900796812749004, "calib/mu_c": 0.9900724637681159, "calib/mu_w": 0.9900884955752213, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.44027888446215147, "calib/std_conf": 0.0012598715777563277, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9308571428571428, "calib/step_q_c_n": 1190.0, "calib/step_q_gap": 0.0027402597402598206, "calib/step_q_w": 0.928116883116883, "calib/step_q_w_n": 924.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1327.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 795.71875, "completions/mean_terminated_length": 811.5697631835938, "completions/min_length": 0.0, "completions/min_terminated_length": 537.0, "epoch": 0.16533333333333333, "grad_norm": 0.005755608901381493, "kl": 0.0548095703125, "learning_rate": 1.25e-06, "loss": -0.0586, "num_tokens": 50857324.0, "reward": 0.7406020760536194, "reward_std": 0.23273462057113647, "rewards/accuracy_reward_step": 0.5390625, "rewards/asymmetric_l2_reward": 0.6313012838363647, "rewards/final_brier_reward_step": 0.5475590229034424, "rewards/format_reward_step": 0.97265625, "step": 155 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5070422535211268, "calib/avg_num_step_conf": 8.2421875, "calib/ece": 0.4310236220472441, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0001408450704224462, "calib/mean_conf": 0.9900787401574803, "calib/mu_c": 0.9901408450704225, "calib/mu_w": 0.9900000000000001, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4310236220472441, "calib/std_conf": 0.001252438875636762, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9351894818252126, "calib/step_q_c_n": 1293.0, "calib/step_q_gap": 0.014822284762789018, "calib/step_q_w": 0.9203671970624235, "calib/step_q_w_n": 817.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2409.0, "completions/max_terminated_length": 2409.0, "completions/mean_length": 843.8125, "completions/mean_terminated_length": 847.1216430664062, "completions/min_length": 0.0, "completions/min_terminated_length": 490.0, "epoch": 0.1664, "grad_norm": 0.005306870210915804, "kl": 0.051036834716796875, "learning_rate": 1.2222222222222223e-06, "loss": 0.0101, "num_tokens": 51178100.0, "reward": 0.7855024933815002, "reward_std": 0.25378745794296265, "rewards/accuracy_reward_step": 0.5546875, "rewards/asymmetric_l2_reward": 0.7000096440315247, "rewards/final_brier_reward_step": 0.5631827712059021, "rewards/format_reward_step": 0.984375, "step": 156 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5118317581704062, "calib/avg_num_step_conf": 8.30078125, "calib/ece": 0.32208000000000014, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00048048481350548844, "calib/mean_conf": 0.9900800000000002, "calib/mu_c": 0.9902395209580837, "calib/mu_w": 0.9897590361445782, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.32208000000000014, "calib/std_conf": 0.002365079279855119, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9310576271186441, "calib/step_q_c_n": 1475.0, "calib/step_q_gap": 0.00818070404172111, "calib/step_q_w": 0.922876923076923, "calib/step_q_w_n": 650.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2060.0, "completions/max_terminated_length": 2060.0, "completions/mean_length": 822.86328125, "completions/mean_terminated_length": 839.2550048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 513.0, "epoch": 0.16746666666666668, "grad_norm": 0.005075718276202679, "kl": 0.0534820556640625, "learning_rate": 1.1944444444444446e-06, "loss": -0.0414, "num_tokens": 51492481.0, "reward": 0.8595537543296814, "reward_std": 0.16089923679828644, "rewards/accuracy_reward_step": 0.65234375, "rewards/asymmetric_l2_reward": 0.7353023886680603, "rewards/final_brier_reward_step": 0.6588050723075867, "rewards/format_reward_step": 0.97265625, "step": 157 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5083156639279088, "calib/avg_num_step_conf": 8.12890625, "calib/ece": 0.37880952380952393, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00037105751391453, "calib/mean_conf": 0.9899206349206351, "calib/mu_c": 0.9900649350649349, "calib/mu_w": 0.9896938775510203, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37880952380952393, "calib/std_conf": 0.001990466064124518, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9337134502923978, "calib/step_q_c_n": 1368.0, "calib/step_q_gap": 0.013082314247516935, "calib/step_q_w": 0.9206311360448809, "calib/step_q_w_n": 713.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2763.0, "completions/max_terminated_length": 2763.0, "completions/mean_length": 823.91796875, "completions/mean_terminated_length": 830.405517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 518.0, "epoch": 0.16853333333333334, "grad_norm": 0.006082612089812756, "kl": 0.05602264404296875, "learning_rate": 1.1666666666666668e-06, "loss": 0.019, "num_tokens": 51808644.0, "reward": 0.8030421733856201, "reward_std": 0.2842303514480591, "rewards/accuracy_reward_step": 0.6015625, "rewards/asymmetric_l2_reward": 0.6795475482940674, "rewards/final_brier_reward_step": 0.609349250793457, "rewards/format_reward_step": 0.984375, "step": 158 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5009805910597146, "calib/avg_num_step_conf": 8.23828125, "calib/ece": 0.35916269841269854, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 2.901196997351363e-05, "calib/mean_conf": 0.9901150793650795, "calib/mu_c": 0.9901257861635216, "calib/mu_w": 0.9900967741935481, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.35916269841269854, "calib/std_conf": 0.0010496838118273031, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.9312385643912737, "calib/step_q_c_n": 1421.0, "calib/step_q_gap": 0.011408622530808543, "calib/step_q_w": 0.9198299418604652, "calib/step_q_w_n": 688.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2796.0, "completions/max_terminated_length": 2796.0, "completions/mean_length": 792.65625, "completions/mean_terminated_length": 798.8976440429688, "completions/min_length": 0.0, "completions/min_terminated_length": 566.0, "epoch": 0.1696, "grad_norm": 0.005457705818116665, "kl": 0.049915313720703125, "learning_rate": 1.138888888888889e-06, "loss": -0.0069, "num_tokens": 52116348.0, "reward": 0.845173180103302, "reward_std": 0.22259722650051117, "rewards/accuracy_reward_step": 0.62109375, "rewards/asymmetric_l2_reward": 0.7504348754882812, "rewards/final_brier_reward_step": 0.6203800439834595, "rewards/format_reward_step": 0.9765625, "step": 159 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5054326610279767, "calib/avg_num_step_conf": 7.8515625, "calib/ece": 0.41247011952191226, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00011255692908251902, "calib/mean_conf": 0.9901593625498007, "calib/mu_c": 0.990206896551724, "calib/mu_w": 0.9900943396226415, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.41247011952191226, "calib/std_conf": 0.0017781604465674436, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.9329748822605967, "calib/step_q_c_n": 1274.0, "calib/step_q_gap": 0.01552923008668361, "calib/step_q_w": 0.917445652173913, "calib/step_q_w_n": 736.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2869.0, "completions/max_terminated_length": 2869.0, "completions/mean_length": 856.1015625, "completions/mean_terminated_length": 859.4588623046875, "completions/min_length": 0.0, "completions/min_terminated_length": 560.0, "epoch": 0.17066666666666666, "grad_norm": 0.005602055694907904, "kl": 0.04767608642578125, "learning_rate": 1.111111111111111e-06, "loss": 0.0147, "num_tokens": 52440350.0, "reward": 0.7861953973770142, "reward_std": 0.2470768243074417, "rewards/accuracy_reward_step": 0.56640625, "rewards/asymmetric_l2_reward": 0.6940505504608154, "rewards/final_brier_reward_step": 0.5705277323722839, "rewards/format_reward_step": 0.97265625, "step": 160 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.500601829561868, "calib/avg_num_step_conf": 8.19921875, "calib/ece": 0.2549802371541503, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.2036591237607475e-05, "calib/mean_conf": 0.9901581027667985, "calib/mu_c": 0.9901612903225807, "calib/mu_w": 0.9901492537313431, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2549802371541503, "calib/std_conf": 0.001247409789569922, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9285210824417873, "calib/step_q_c_n": 1589.0, "calib/step_q_gap": 0.009305396167277546, "calib/step_q_w": 0.9192156862745098, "calib/step_q_w_n": 510.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2497.0, "completions/max_terminated_length": 2497.0, "completions/mean_length": 786.68359375, "completions/mean_terminated_length": 792.8779296875, "completions/min_length": 0.0, "completions/min_terminated_length": 433.0, "epoch": 0.17173333333333332, "grad_norm": 0.005685119424015284, "kl": 0.056888580322265625, "learning_rate": 1.0833333333333335e-06, "loss": 0.0248, "num_tokens": 52745661.0, "reward": 0.9298093318939209, "reward_std": 0.1548190861940384, "rewards/accuracy_reward_step": 0.7265625, "rewards/asymmetric_l2_reward": 0.785028338432312, "rewards/final_brier_reward_step": 0.7316214442253113, "rewards/format_reward_step": 0.98828125, "step": 161 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.512847790507365, "calib/avg_num_step_conf": 8.0078125, "calib/ece": 0.246798418972332, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0008756137479541959, "calib/mean_conf": 0.9898814229249012, "calib/mu_c": 0.9901063829787233, "calib/mu_w": 0.9892307692307691, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.246798418972332, "calib/std_conf": 0.0034984144843255077, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9292870427774333, "calib/step_q_c_n": 1613.0, "calib/step_q_gap": 0.014275601129836057, "calib/step_q_w": 0.9150114416475973, "calib/step_q_w_n": 437.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1588.0, "completions/max_terminated_length": 1588.0, "completions/mean_length": 774.3359375, "completions/mean_terminated_length": 783.517822265625, "completions/min_length": 0.0, "completions/min_terminated_length": 481.0, "epoch": 0.1728, "grad_norm": 0.00584648922085762, "kl": 0.0536041259765625, "learning_rate": 1.0555555555555557e-06, "loss": -0.0188, "num_tokens": 53048035.0, "reward": 0.9473156332969666, "reward_std": 0.21745626628398895, "rewards/accuracy_reward_step": 0.734375, "rewards/asymmetric_l2_reward": 0.8112285733222961, "rewards/final_brier_reward_step": 0.7396527528762817, "rewards/format_reward_step": 0.984375, "step": 162 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5099684168969602, "calib/avg_num_step_conf": 7.53515625, "calib/ece": 0.3964940239043825, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0002013422818791799, "calib/mean_conf": 0.9901195219123506, "calib/mu_c": 0.990201342281879, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3964940239043825, "calib/std_conf": 0.0014063237127269169, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9303259141494437, "calib/step_q_c_n": 1258.0, "calib/step_q_gap": 0.01974469209281171, "calib/step_q_w": 0.910581222056632, "calib/step_q_w_n": 671.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2831.0, "completions/max_terminated_length": 2831.0, "completions/mean_length": 826.18359375, "completions/mean_terminated_length": 835.9802856445312, "completions/min_length": 0.0, "completions/min_terminated_length": 428.0, "epoch": 0.17386666666666667, "grad_norm": 0.005138279404491186, "kl": 0.048274993896484375, "learning_rate": 1.0277777777777777e-06, "loss": -0.0275, "num_tokens": 53364370.0, "reward": 0.8028818368911743, "reward_std": 0.21207211911678314, "rewards/accuracy_reward_step": 0.58203125, "rewards/asymmetric_l2_reward": 0.7033613920211792, "rewards/final_brier_reward_step": 0.5899022817611694, "rewards/format_reward_step": 0.98046875, "step": 163 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.508445945945946, "calib/avg_num_step_conf": 6.8828125, "calib/ece": 0.38829268292682917, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0003736900165469681, "calib/mean_conf": 0.9899186991869918, "calib/mu_c": 0.9900675675675674, "calib/mu_w": 0.9896938775510205, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.38829268292682917, "calib/std_conf": 0.002014554746888435, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9237198929527208, "calib/step_q_c_n": 1121.0, "calib/step_q_gap": 0.01459352789811863, "calib/step_q_w": 0.9091263650546022, "calib/step_q_w_n": 641.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1631.0, "completions/max_terminated_length": 1631.0, "completions/mean_length": 794.96875, "completions/mean_terminated_length": 820.6128540039062, "completions/min_length": 0.0, "completions/min_terminated_length": 544.0, "epoch": 0.17493333333333333, "grad_norm": 0.005803633015602827, "kl": 0.04936981201171875, "learning_rate": 1.0000000000000002e-06, "loss": -0.0688, "num_tokens": 53674018.0, "reward": 0.7884545922279358, "reward_std": 0.2159702479839325, "rewards/accuracy_reward_step": 0.578125, "rewards/asymmetric_l2_reward": 0.6831825971603394, "rewards/final_brier_reward_step": 0.5859140157699585, "rewards/format_reward_step": 0.9609375, "step": 164 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5034391534391535, "calib/avg_num_step_conf": 7.53515625, "calib/ece": 0.4117269076305222, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 6.94444444444553e-05, "calib/mean_conf": 0.9900401606425704, "calib/mu_c": 0.9900694444444444, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4117269076305222, "calib/std_conf": 0.0010969076533130662, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9288812980358668, "calib/step_q_c_n": 1171.0, "calib/step_q_gap": 0.010754648959349633, "calib/step_q_w": 0.9181266490765172, "calib/step_q_w_n": 758.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2997.0, "completions/max_terminated_length": 2997.0, "completions/mean_length": 816.93359375, "completions/mean_terminated_length": 826.62060546875, "completions/min_length": 0.0, "completions/min_terminated_length": 505.0, "epoch": 0.176, "grad_norm": 0.005772776901721954, "kl": 0.047878265380859375, "learning_rate": 9.722222222222224e-07, "loss": 0.0007, "num_tokens": 53988729.0, "reward": 0.7707719802856445, "reward_std": 0.2285512387752533, "rewards/accuracy_reward_step": 0.5625, "rewards/asymmetric_l2_reward": 0.6656252145767212, "rewards/final_brier_reward_step": 0.570449948310852, "rewards/format_reward_step": 0.96484375, "step": 165 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5097144194756553, "calib/avg_num_step_conf": 7.55078125, "calib/ece": 0.27800000000000014, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00019506866416951762, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.9900561797752808, "calib/mu_w": 0.9898611111111113, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27800000000000014, "calib/std_conf": 0.0008944271909999167, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9282830315224683, "calib/step_q_c_n": 1491.0, "calib/step_q_gap": 0.02353190030074892, "calib/step_q_w": 0.9047511312217194, "calib/step_q_w_n": 442.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2352.0, "completions/max_terminated_length": 2352.0, "completions/mean_length": 811.90234375, "completions/mean_terminated_length": 824.7897338867188, "completions/min_length": 0.0, "completions/min_terminated_length": 505.0, "epoch": 0.17706666666666668, "grad_norm": 0.005784960929304361, "kl": 0.05084228515625, "learning_rate": 9.444444444444445e-07, "loss": -0.0096, "num_tokens": 54302760.0, "reward": 0.9043484330177307, "reward_std": 0.19850796461105347, "rewards/accuracy_reward_step": 0.6953125, "rewards/asymmetric_l2_reward": 0.7734046578407288, "rewards/final_brier_reward_step": 0.7009171843528748, "rewards/format_reward_step": 0.9765625, "step": 166 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5173226164079823, "calib/avg_num_step_conf": 7.82421875, "calib/ece": 0.339202380952381, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0003431263858091338, "calib/mean_conf": 0.9899960317460318, "calib/mu_c": 0.9901158536585365, "calib/mu_w": 0.9897727272727274, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.339202380952381, "calib/std_conf": 0.0012295889617531574, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.928510082150859, "calib/step_q_c_n": 1339.0, "calib/step_q_gap": 0.00506128697013597, "calib/step_q_w": 0.923448795180723, "calib/step_q_w_n": 664.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2766.0, "completions/max_terminated_length": 2766.0, "completions/mean_length": 806.5859375, "completions/mean_terminated_length": 816.1502075195312, "completions/min_length": 0.0, "completions/min_terminated_length": 476.0, "epoch": 0.17813333333333334, "grad_norm": 0.005531142000108957, "kl": 0.052581787109375, "learning_rate": 9.166666666666666e-07, "loss": 0.005, "num_tokens": 54614854.0, "reward": 0.8472436666488647, "reward_std": 0.23025396466255188, "rewards/accuracy_reward_step": 0.640625, "rewards/asymmetric_l2_reward": 0.722790002822876, "rewards/final_brier_reward_step": 0.647478461265564, "rewards/format_reward_step": 0.98046875, "step": 167 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.0234375, "calib/ece": 0.2864426877470355, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 2.220446049250313e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.99, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2864426877470355, "calib/std_conf": 0.000889108448948775, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9222430607651912, "calib/step_q_c_n": 1333.0, "calib/step_q_gap": 0.022200050012503136, "calib/step_q_w": 0.9000430107526881, "calib/step_q_w_n": 465.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1629.0, "completions/max_terminated_length": 1629.0, "completions/mean_length": 830.3984375, "completions/mean_terminated_length": 840.2451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 524.0, "epoch": 0.1792, "grad_norm": 0.006028932053595781, "kl": 0.04668426513671875, "learning_rate": 8.88888888888889e-07, "loss": 0.0018, "num_tokens": 54932108.0, "reward": 0.8952298164367676, "reward_std": 0.20066331326961517, "rewards/accuracy_reward_step": 0.6953125, "rewards/asymmetric_l2_reward": 0.752668559551239, "rewards/final_brier_reward_step": 0.7010722160339355, "rewards/format_reward_step": 0.98828125, "step": 168 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.91796875, "calib/ece": 0.3892094861660079, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.1102230246251565e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3892094861660079, "calib/std_conf": 0.000889108448948775, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9278112609040444, "calib/step_q_c_n": 1261.0, "calib/step_q_gap": 0.01910368910247795, "calib/step_q_w": 0.9087075718015665, "calib/step_q_w_n": 766.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2466.0, "completions/max_terminated_length": 2466.0, "completions/mean_length": 823.63671875, "completions/mean_terminated_length": 830.1220703125, "completions/min_length": 0.0, "completions/min_terminated_length": 541.0, "epoch": 0.18026666666666666, "grad_norm": 0.005781630985438824, "kl": 0.04717254638671875, "learning_rate": 8.611111111111112e-07, "loss": 0.018, "num_tokens": 55247143.0, "reward": 0.8189793825149536, "reward_std": 0.17103320360183716, "rewards/accuracy_reward_step": 0.59375, "rewards/asymmetric_l2_reward": 0.7200114727020264, "rewards/final_brier_reward_step": 0.6015409827232361, "rewards/format_reward_step": 0.98828125, "step": 169 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.511520737327189, "calib/avg_num_step_conf": 7.6484375, "calib/ece": 0.3561811023622047, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0002317504842052065, "calib/mean_conf": 0.9900393700787401, "calib/mu_c": 0.9901242236024846, "calib/mu_w": 0.9898924731182794, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3561811023622047, "calib/std_conf": 0.0010860719861522626, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9301465201465203, "calib/step_q_c_n": 1365.0, "calib/step_q_gap": 0.014278054716503452, "calib/step_q_w": 0.9158684654300169, "calib/step_q_w_n": 593.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2043.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 824.89453125, "completions/mean_terminated_length": 828.1294555664062, "completions/min_length": 0.0, "completions/min_terminated_length": 102.0, "epoch": 0.18133333333333335, "grad_norm": 0.14566980302333832, "kl": 0.09648895263671875, "learning_rate": 8.333333333333333e-07, "loss": -0.0184, "num_tokens": 55562468.0, "reward": 0.8437641263008118, "reward_std": 0.2100234031677246, "rewards/accuracy_reward_step": 0.62890625, "rewards/asymmetric_l2_reward": 0.7280181050300598, "rewards/final_brier_reward_step": 0.6360726356506348, "rewards/format_reward_step": 0.98828125, "step": 170 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.49199641439364833, "calib/avg_num_step_conf": 7.859375, "calib/ece": 0.44418326693227095, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0001607119989756045, "calib/mean_conf": 0.99, "calib/mu_c": 0.9899270072992701, "calib/mu_w": 0.9900877192982457, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.44418326693227095, "calib/std_conf": 0.0008926436853549044, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9300573300573299, "calib/step_q_c_n": 1221.0, "calib/step_q_gap": 0.015556697946078168, "calib/step_q_w": 0.9145006321112518, "calib/step_q_w_n": 791.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1726.0, "completions/max_terminated_length": 1726.0, "completions/mean_length": 787.0546875, "completions/mean_terminated_length": 799.5476684570312, "completions/min_length": 0.0, "completions/min_terminated_length": 494.0, "epoch": 0.1824, "grad_norm": 0.006573774851858616, "kl": 0.046173095703125, "learning_rate": 8.055555555555557e-07, "loss": -0.0112, "num_tokens": 55870850.0, "reward": 0.7430707216262817, "reward_std": 0.2690494954586029, "rewards/accuracy_reward_step": 0.53515625, "rewards/asymmetric_l2_reward": 0.6438180208206177, "rewards/final_brier_reward_step": 0.5399796962738037, "rewards/format_reward_step": 0.9765625, "step": 171 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4972972972972973, "calib/avg_num_step_conf": 8.41015625, "calib/ece": 0.2586561264822135, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00016216216216202728, "calib/mean_conf": 0.9898814229249012, "calib/mu_c": 0.9898378378378379, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2586561264822135, "calib/std_conf": 0.0018823527114293543, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.933501213592233, "calib/step_q_c_n": 1648.0, "calib/step_q_gap": 0.011223985869460718, "calib/step_q_w": 0.9222772277227723, "calib/step_q_w_n": 505.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2312.0, "completions/max_terminated_length": 2312.0, "completions/mean_length": 815.2265625, "completions/mean_terminated_length": 821.6456909179688, "completions/min_length": 0.0, "completions/min_terminated_length": 562.0, "epoch": 0.18346666666666667, "grad_norm": 0.006808638572692871, "kl": 0.05686187744140625, "learning_rate": 7.777777777777779e-07, "loss": -0.0047, "num_tokens": 56182900.0, "reward": 0.9279036521911621, "reward_std": 0.18457263708114624, "rewards/accuracy_reward_step": 0.72265625, "rewards/asymmetric_l2_reward": 0.7857556343078613, "rewards/final_brier_reward_step": 0.7278640270233154, "rewards/format_reward_step": 0.98828125, "step": 172 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.49678574037548395, "calib/avg_num_step_conf": 8.375, "calib/ece": 0.31408, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -6.428519249035425e-05, "calib/mean_conf": 0.9900800000000001, "calib/mu_c": 0.9900591715976331, "calib/mu_w": 0.9901234567901235, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.31408, "calib/std_conf": 0.0008908422980528043, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.9305886475122636, "calib/step_q_c_n": 1427.0, "calib/step_q_gap": 0.002834114736810167, "calib/step_q_w": 0.9277545327754534, "calib/step_q_w_n": 717.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2817.0, "completions/max_terminated_length": 2817.0, "completions/mean_length": 838.22265625, "completions/mean_terminated_length": 844.8228149414062, "completions/min_length": 0.0, "completions/min_terminated_length": 457.0, "epoch": 0.18453333333333333, "grad_norm": 0.0054668583907186985, "kl": 0.051910400390625, "learning_rate": 7.5e-07, "loss": 0.0027, "num_tokens": 56500645.0, "reward": 0.8591402769088745, "reward_std": 0.2172548919916153, "rewards/accuracy_reward_step": 0.6640625, "rewards/asymmetric_l2_reward": 0.7247052192687988, "rewards/final_brier_reward_step": 0.666231632232666, "rewards/format_reward_step": 0.97265625, "step": 173 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.4963296629963297, "calib/avg_num_step_conf": 7.8203125, "calib/ece": 0.44117886178861787, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -7.407407407389321e-05, "calib/mean_conf": 0.989959349593496, "calib/mu_c": 0.9899259259259259, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.44117886178861787, "calib/std_conf": 0.0011035668260567673, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9252323420074351, "calib/step_q_c_n": 1076.0, "calib/step_q_gap": 0.002132989955599296, "calib/step_q_w": 0.9230993520518358, "calib/step_q_w_n": 926.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2459.0, "completions/max_terminated_length": 2459.0, "completions/mean_length": 890.26953125, "completions/mean_terminated_length": 904.4008178710938, "completions/min_length": 0.0, "completions/min_terminated_length": 549.0, "epoch": 0.1856, "grad_norm": 0.00568380206823349, "kl": 0.0473785400390625, "learning_rate": 7.222222222222222e-07, "loss": -0.0077, "num_tokens": 56832786.0, "reward": 0.7364821434020996, "reward_std": 0.29423683881759644, "rewards/accuracy_reward_step": 0.52734375, "rewards/asymmetric_l2_reward": 0.6393903493881226, "rewards/final_brier_reward_step": 0.5359175205230713, "rewards/format_reward_step": 0.9609375, "step": 174 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5044642857142857, "calib/avg_num_step_conf": 7.99609375, "calib/ece": 0.5420400000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 8.92857142859027e-05, "calib/mean_conf": 0.99004, "calib/mu_c": 0.9900892857142859, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.5420400000000001, "calib/std_conf": 0.0006311893535223806, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.9312725450901804, "calib/step_q_c_n": 998.0, "calib/step_q_gap": 0.007669113250332926, "calib/step_q_w": 0.9236034318398475, "calib/step_q_w_n": 1049.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2381.0, "completions/max_terminated_length": 2381.0, "completions/mean_length": 828.3046875, "completions/mean_terminated_length": 841.4524536132812, "completions/min_length": 0.0, "completions/min_terminated_length": 472.0, "epoch": 0.18666666666666668, "grad_norm": 0.00572630763053894, "kl": 0.0507659912109375, "learning_rate": 6.944444444444446e-07, "loss": -0.0266, "num_tokens": 57150656.0, "reward": 0.6720933318138123, "reward_std": 0.26687222719192505, "rewards/accuracy_reward_step": 0.4375, "rewards/asymmetric_l2_reward": 0.618736207485199, "rewards/final_brier_reward_step": 0.444200336933136, "rewards/format_reward_step": 0.96875, "step": 175 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5012851731601731, "calib/avg_num_step_conf": 8.3828125, "calib/ece": 0.3741200000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 2.5703463203297083e-05, "calib/mean_conf": 0.9901200000000001, "calib/mu_c": 0.9901298701298701, "calib/mu_w": 0.9901041666666668, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3741200000000001, "calib/std_conf": 0.001088852607105297, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9325920577617329, "calib/step_q_c_n": 1385.0, "calib/step_q_gap": 0.009280625435845957, "calib/step_q_w": 0.923311432325887, "calib/step_q_w_n": 761.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2884.0, "completions/max_terminated_length": 2884.0, "completions/mean_length": 824.0625, "completions/mean_terminated_length": 833.8340454101562, "completions/min_length": 0.0, "completions/min_terminated_length": 522.0, "epoch": 0.18773333333333334, "grad_norm": 0.005427682306617498, "kl": 0.047412872314453125, "learning_rate": 6.666666666666667e-07, "loss": -0.0152, "num_tokens": 57465680.0, "reward": 0.8262530565261841, "reward_std": 0.2361697107553482, "rewards/accuracy_reward_step": 0.6015625, "rewards/asymmetric_l2_reward": 0.733539342880249, "rewards/final_brier_reward_step": 0.604904294013977, "rewards/format_reward_step": 0.96875, "step": 176 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5104166666666667, "calib/avg_num_step_conf": 8.49609375, "calib/ece": 0.36639215686274507, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00020833333333303283, "calib/mean_conf": 0.989921568627451, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.9897916666666667, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36639215686274507, "calib/std_conf": 0.0008821350493491767, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9363442940038684, "calib/step_q_c_n": 1551.0, "calib/step_q_gap": 0.024469294003868436, "calib/step_q_w": 0.911875, "calib/step_q_w_n": 624.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1973.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 828.94921875, "completions/mean_terminated_length": 828.94921875, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 0.1888, "grad_norm": 0.006155412178486586, "kl": 0.05728912353515625, "learning_rate": 6.388888888888889e-07, "loss": -0.0037, "num_tokens": 57781723.0, "reward": 0.8317885398864746, "reward_std": 0.20242759585380554, "rewards/accuracy_reward_step": 0.62109375, "rewards/asymmetric_l2_reward": 0.7114914655685425, "rewards/final_brier_reward_step": 0.6286479830741882, "rewards/format_reward_step": 0.99609375, "step": 177 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.493632504548211, "calib/avg_num_step_conf": 8.33984375, "calib/ece": 0.3778800000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -2.7626170743255685e-05, "calib/mean_conf": 0.9898800000000001, "calib/mu_c": 0.9898692810457516, "calib/mu_w": 0.9898969072164948, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3778800000000001, "calib/std_conf": 0.0018935680605671416, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9328140013726837, "calib/step_q_c_n": 1457.0, "calib/step_q_gap": 0.019878898127845823, "calib/step_q_w": 0.9129351032448378, "calib/step_q_w_n": 678.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2758.0, "completions/max_terminated_length": 2758.0, "completions/mean_length": 804.12109375, "completions/mean_terminated_length": 823.4200439453125, "completions/min_length": 0.0, "completions/min_terminated_length": 535.0, "epoch": 0.18986666666666666, "grad_norm": 0.005673910956829786, "kl": 0.0516510009765625, "learning_rate": 6.111111111111112e-07, "loss": -0.032, "num_tokens": 58093650.0, "reward": 0.8042181134223938, "reward_std": 0.26732301712036133, "rewards/accuracy_reward_step": 0.59765625, "rewards/asymmetric_l2_reward": 0.6892424821853638, "rewards/final_brier_reward_step": 0.6051312685012817, "rewards/format_reward_step": 0.97265625, "step": 178 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5032679738562091, "calib/avg_num_step_conf": 8.484375, "calib/ece": 0.37804000000000004, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 6.535947712427159e-05, "calib/mean_conf": 0.99004, "calib/mu_c": 0.9900653594771243, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.37804000000000004, "calib/std_conf": 0.0006311893535223806, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9356678200692042, "calib/step_q_c_n": 1445.0, "calib/step_q_gap": 0.022737971375944244, "calib/step_q_w": 0.91292984869326, "calib/step_q_w_n": 727.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2893.0, "completions/max_terminated_length": 2893.0, "completions/mean_length": 834.37109375, "completions/mean_terminated_length": 847.6151123046875, "completions/min_length": 0.0, "completions/min_terminated_length": 526.0, "epoch": 0.19093333333333334, "grad_norm": 0.005824382416903973, "kl": 0.055938720703125, "learning_rate": 5.833333333333334e-07, "loss": -0.0158, "num_tokens": 58413513.0, "reward": 0.8095970153808594, "reward_std": 0.23507192730903625, "rewards/accuracy_reward_step": 0.59765625, "rewards/asymmetric_l2_reward": 0.7009310722351074, "rewards/final_brier_reward_step": 0.6049816012382507, "rewards/format_reward_step": 0.96875, "step": 179 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.1328125, "calib/ece": 0.37095238095238103, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1102230246251565e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.99, "calib/mu_w": 0.9900000000000001, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37095238095238103, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9301554770318021, "calib/step_q_c_n": 1415.0, "calib/step_q_gap": 0.01328891031516033, "calib/step_q_w": 0.9168665667166418, "calib/step_q_w_n": 667.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1975.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 851.38671875, "completions/mean_terminated_length": 864.9008178710938, "completions/min_length": 0.0, "completions/min_terminated_length": 484.0, "epoch": 0.192, "grad_norm": 0.00597095163539052, "kl": 0.0483551025390625, "learning_rate": 5.555555555555555e-07, "loss": -0.0073, "num_tokens": 58735324.0, "reward": 0.8375976085662842, "reward_std": 0.21114030480384827, "rewards/accuracy_reward_step": 0.609375, "rewards/asymmetric_l2_reward": 0.7396686673164368, "rewards/final_brier_reward_step": 0.6167765855789185, "rewards/format_reward_step": 0.984375, "step": 180 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.49568965517241376, "calib/avg_num_step_conf": 8.515625, "calib/ece": 0.4449411764705882, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -8.620689655181035e-05, "calib/mean_conf": 0.9900392156862744, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9900862068965517, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4449411764705882, "calib/std_conf": 0.0010839431342027658, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9334858044164037, "calib/step_q_c_n": 1268.0, "calib/step_q_gap": 0.0100647517848248, "calib/step_q_w": 0.9234210526315789, "calib/step_q_w_n": 912.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2102.0, "completions/max_terminated_length": 2102.0, "completions/mean_length": 815.78125, "completions/mean_terminated_length": 818.98046875, "completions/min_length": 0.0, "completions/min_terminated_length": 491.0, "epoch": 0.19306666666666666, "grad_norm": 0.006558713968843222, "kl": 0.0558013916015625, "learning_rate": 5.277777777777779e-07, "loss": 0.0032, "num_tokens": 59050428.0, "reward": 0.7611607313156128, "reward_std": 0.2021130919456482, "rewards/accuracy_reward_step": 0.54296875, "rewards/asymmetric_l2_reward": 0.6626558899879456, "rewards/final_brier_reward_step": 0.5518530607223511, "rewards/format_reward_step": 0.99609375, "step": 181 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.49774865591397854, "calib/avg_num_step_conf": 9.03515625, "calib/ece": 0.35766798418972334, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -4.502688171992286e-05, "calib/mean_conf": 0.9900790513833992, "calib/mu_c": 0.9900625000000002, "calib/mu_w": 0.9901075268817201, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35766798418972334, "calib/std_conf": 0.0008855872135339169, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9366816431322209, "calib/step_q_c_n": 1558.0, "calib/step_q_gap": 0.014006146443479195, "calib/step_q_w": 0.9226754966887417, "calib/step_q_w_n": 755.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1666.0, "completions/max_terminated_length": 1666.0, "completions/mean_length": 819.6796875, "completions/mean_terminated_length": 829.3992309570312, "completions/min_length": 0.0, "completions/min_terminated_length": 575.0, "epoch": 0.19413333333333332, "grad_norm": 0.0055109853856265545, "kl": 0.0526275634765625, "learning_rate": 5.000000000000001e-07, "loss": -0.0121, "num_tokens": 59366426.0, "reward": 0.818576455116272, "reward_std": 0.22150349617004395, "rewards/accuracy_reward_step": 0.625, "rewards/asymmetric_l2_reward": 0.6832661628723145, "rewards/final_brier_reward_step": 0.6320116519927979, "rewards/format_reward_step": 0.984375, "step": 182 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5013616557734205, "calib/avg_num_step_conf": 8.35546875, "calib/ece": 0.3756626506024098, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 2.655228758163819e-05, "calib/mean_conf": 0.990120481927711, "calib/mu_c": 0.9901307189542484, "calib/mu_w": 0.9901041666666668, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3756626506024098, "calib/std_conf": 0.001411919343875258, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9352155771905425, "calib/step_q_c_n": 1438.0, "calib/step_q_gap": 0.019224136391683855, "calib/step_q_w": 0.9159914407988586, "calib/step_q_w_n": 701.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2902.0, "completions/max_terminated_length": 2902.0, "completions/mean_length": 866.56640625, "completions/mean_terminated_length": 880.3214721679688, "completions/min_length": 0.0, "completions/min_terminated_length": 557.0, "epoch": 0.1952, "grad_norm": 0.00530533492565155, "kl": 0.05149078369140625, "learning_rate": 4.7222222222222226e-07, "loss": -0.014, "num_tokens": 59694947.0, "reward": 0.8005015850067139, "reward_std": 0.2967371344566345, "rewards/accuracy_reward_step": 0.59765625, "rewards/asymmetric_l2_reward": 0.6819595098495483, "rewards/final_brier_reward_step": 0.6049812436103821, "rewards/format_reward_step": 0.97265625, "step": 183 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.503170664460987, "calib/avg_num_step_conf": 8.55078125, "calib/ece": 0.3635341365461848, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 6.41025641027193e-05, "calib/mean_conf": 0.9900401606425704, "calib/mu_c": 0.9900641025641025, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3635341365461848, "calib/std_conf": 0.0010969076533130662, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9356889204545457, "calib/step_q_c_n": 1408.0, "calib/step_q_gap": 0.009888664372599498, "calib/step_q_w": 0.9258002560819462, "calib/step_q_w_n": 781.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1398.0, "completions/max_terminated_length": 1398.0, "completions/mean_length": 827.08203125, "completions/mean_terminated_length": 850.3333129882812, "completions/min_length": 0.0, "completions/min_terminated_length": 585.0, "epoch": 0.19626666666666667, "grad_norm": 0.005826389882713556, "kl": 0.0535430908203125, "learning_rate": 4.444444444444445e-07, "loss": -0.0479, "num_tokens": 60011960.0, "reward": 0.821940541267395, "reward_std": 0.22585231065750122, "rewards/accuracy_reward_step": 0.609375, "rewards/asymmetric_l2_reward": 0.7109317779541016, "rewards/final_brier_reward_step": 0.6165429353713989, "rewards/format_reward_step": 0.97265625, "step": 184 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5150070126227209, "calib/avg_num_step_conf": 8.953125, "calib/ece": 0.36255060728744926, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0003022440392704828, "calib/mean_conf": 0.9900809716599189, "calib/mu_c": 0.9901935483870965, "calib/mu_w": 0.989891304347826, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.36255060728744926, "calib/std_conf": 0.001269990861648432, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.9383408360128617, "calib/step_q_c_n": 1555.0, "calib/step_q_gap": 0.01623771525302442, "calib/step_q_w": 0.9221031207598372, "calib/step_q_w_n": 737.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2964.0, "completions/max_terminated_length": 2964.0, "completions/mean_length": 853.453125, "completions/mean_terminated_length": 870.4542236328125, "completions/min_length": 0.0, "completions/min_terminated_length": 84.0, "epoch": 0.19733333333333333, "grad_norm": 9127.7880859375, "kl": 41984.04993057251, "learning_rate": 4.1666666666666667e-07, "loss": 419.4765, "num_tokens": 60337364.0, "reward": 0.7966436147689819, "reward_std": 0.2328513264656067, "rewards/accuracy_reward_step": 0.60546875, "rewards/asymmetric_l2_reward": 0.6721329689025879, "rewards/final_brier_reward_step": 0.608654260635376, "rewards/format_reward_step": 0.95703125, "step": 185 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5034246575342466, "calib/avg_num_step_conf": 8.96875, "calib/ece": 0.4083665338645418, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 6.849315068480077e-05, "calib/mean_conf": 0.9900398406374502, "calib/mu_c": 0.9900684931506848, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4083665338645418, "calib/std_conf": 0.0006299357888781636, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.938288770053476, "calib/step_q_c_n": 1496.0, "calib/step_q_gap": 0.015901270053475858, "calib/step_q_w": 0.9223875000000001, "calib/step_q_w_n": 800.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1551.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 824.5703125, "completions/mean_terminated_length": 840.9960327148438, "completions/min_length": 0.0, "completions/min_terminated_length": 501.0, "epoch": 0.1984, "grad_norm": 0.005916334688663483, "kl": 0.0541534423828125, "learning_rate": 3.8888888888888895e-07, "loss": -0.0308, "num_tokens": 60653494.0, "reward": 0.7912920713424683, "reward_std": 0.23391059041023254, "rewards/accuracy_reward_step": 0.5703125, "rewards/asymmetric_l2_reward": 0.6940099000930786, "rewards/final_brier_reward_step": 0.578417956829071, "rewards/format_reward_step": 0.98046875, "step": 186 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5054662802950475, "calib/avg_num_step_conf": 8.98046875, "calib/ece": 0.4061600000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00010932560590093399, "calib/mean_conf": 0.99016, "calib/mu_c": 0.9902054794520547, "calib/mu_w": 0.9900961538461538, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4061600000000001, "calib/std_conf": 0.001254750971308651, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9334121122599706, "calib/step_q_c_n": 1354.0, "calib/step_q_gap": 0.010163435011293243, "calib/step_q_w": 0.9232486772486773, "calib/step_q_w_n": 945.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1979.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 862.390625, "completions/mean_terminated_length": 872.6166381835938, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.19946666666666665, "grad_norm": 0.005608111619949341, "kl": 0.0528106689453125, "learning_rate": 3.611111111111111e-07, "loss": -0.0251, "num_tokens": 60975810.0, "reward": 0.7882611751556396, "reward_std": 0.30219972133636475, "rewards/accuracy_reward_step": 0.5703125, "rewards/asymmetric_l2_reward": 0.6888840198516846, "rewards/final_brier_reward_step": 0.5782632827758789, "rewards/format_reward_step": 0.9765625, "step": 187 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5011998870694523, "calib/avg_num_step_conf": 8.52734375, "calib/ece": 0.3641016260162603, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 3.204404291368146e-05, "calib/mean_conf": 0.9901178861788619, "calib/mu_c": 0.9901298701298701, "calib/mu_w": 0.9900978260869564, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3641016260162603, "calib/std_conf": 0.001393336786882087, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9348505906879777, "calib/step_q_c_n": 1439.0, "calib/step_q_gap": 0.0140360745589454, "calib/step_q_w": 0.9208145161290323, "calib/step_q_w_n": 744.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2176.0, "completions/max_terminated_length": 2176.0, "completions/mean_length": 859.84765625, "completions/mean_terminated_length": 876.9761352539062, "completions/min_length": 0.0, "completions/min_terminated_length": 463.0, "epoch": 0.20053333333333334, "grad_norm": 0.0053206272423267365, "kl": 0.05606842041015625, "learning_rate": 3.3333333333333335e-07, "loss": -0.0046, "num_tokens": 61300003.0, "reward": 0.7975469827651978, "reward_std": 0.22008016705513, "rewards/accuracy_reward_step": 0.6015625, "rewards/asymmetric_l2_reward": 0.6786959767341614, "rewards/final_brier_reward_step": 0.6046792268753052, "rewards/format_reward_step": 0.95703125, "step": 188 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.59765625, "calib/ece": 0.42027888446215134, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.1102230246251565e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.99, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.42027888446215134, "calib/std_conf": 0.0008926436853549044, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9347464042392128, "calib/step_q_c_n": 1321.0, "calib/step_q_gap": 0.00949640423921283, "calib/step_q_w": 0.92525, "calib/step_q_w_n": 880.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2781.0, "completions/max_terminated_length": 2781.0, "completions/mean_length": 836.6640625, "completions/mean_terminated_length": 846.5850219726562, "completions/min_length": 0.0, "completions/min_terminated_length": 520.0, "epoch": 0.2016, "grad_norm": 0.005423042923212051, "kl": 0.05489349365234375, "learning_rate": 3.055555555555556e-07, "loss": 0.0028, "num_tokens": 61621957.0, "reward": 0.7671196460723877, "reward_std": 0.25127676129341125, "rewards/accuracy_reward_step": 0.55859375, "rewards/asymmetric_l2_reward": 0.6641814112663269, "rewards/final_brier_reward_step": 0.5630265474319458, "rewards/format_reward_step": 0.9765625, "step": 189 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5020397756246813, "calib/avg_num_step_conf": 8.48828125, "calib/ece": 0.40744094488188976, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 4.079551249336344e-05, "calib/mean_conf": 0.9901181102362204, "calib/mu_c": 0.9901351351351348, "calib/mu_w": 0.9900943396226415, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40744094488188976, "calib/std_conf": 0.0010803482467726185, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.934004491017964, "calib/step_q_c_n": 1336.0, "calib/step_q_gap": 0.012140691734809916, "calib/step_q_w": 0.9218637992831541, "calib/step_q_w_n": 837.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2351.0, "completions/max_terminated_length": 2351.0, "completions/mean_length": 866.859375, "completions/mean_terminated_length": 873.68505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 502.0, "epoch": 0.20266666666666666, "grad_norm": 0.005208548624068499, "kl": 0.05187225341796875, "learning_rate": 2.7777777777777776e-07, "loss": -0.025, "num_tokens": 61949481.0, "reward": 0.7925498485565186, "reward_std": 0.19229495525360107, "rewards/accuracy_reward_step": 0.578125, "rewards/asymmetric_l2_reward": 0.6856661438941956, "rewards/final_brier_reward_step": 0.5861523151397705, "rewards/format_reward_step": 0.98828125, "step": 190 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5195272353545735, "calib/avg_num_step_conf": 9.359375, "calib/ece": 0.4362509960159364, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0004033273381293112, "calib/mean_conf": 0.9900358565737053, "calib/mu_c": 0.9902158273381294, "calib/mu_w": 0.9898125000000001, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4362509960159364, "calib/std_conf": 0.001646773122345849, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9380392156862745, "calib/step_q_c_n": 1428.0, "calib/step_q_gap": 0.010193141306109044, "calib/step_q_w": 0.9278460743801654, "calib/step_q_w_n": 968.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2820.0, "completions/max_terminated_length": 2820.0, "completions/mean_length": 833.29296875, "completions/mean_terminated_length": 846.5198974609375, "completions/min_length": 0.0, "completions/min_terminated_length": 561.0, "epoch": 0.20373333333333332, "grad_norm": 0.005339369177818298, "kl": 0.0585479736328125, "learning_rate": 2.5000000000000004e-07, "loss": -0.026, "num_tokens": 62266972.0, "reward": 0.746340811252594, "reward_std": 0.15258918702602386, "rewards/accuracy_reward_step": 0.54296875, "rewards/asymmetric_l2_reward": 0.6362113356590271, "rewards/final_brier_reward_step": 0.5517827868461609, "rewards/format_reward_step": 0.98046875, "step": 191 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5064236838430386, "calib/avg_num_step_conf": 8.4140625, "calib/ece": 0.3665991902834007, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0001298701298703131, "calib/mean_conf": 0.9900809716599189, "calib/mu_c": 0.9901298701298701, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3665991902834007, "calib/std_conf": 0.0012699908616484322, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.936250852079073, "calib/step_q_c_n": 1467.0, "calib/step_q_gap": 0.01818680549974272, "calib/step_q_w": 0.9180640465793303, "calib/step_q_w_n": 687.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2709.0, "completions/max_terminated_length": 2709.0, "completions/mean_length": 837.3828125, "completions/mean_terminated_length": 864.3951416015625, "completions/min_length": 0.0, "completions/min_terminated_length": 270.0, "epoch": 0.2048, "grad_norm": 0.0058121588081121445, "kl": 0.05686187744140625, "learning_rate": 2.2222222222222224e-07, "loss": -0.046, "num_tokens": 62586318.0, "reward": 0.7954423427581787, "reward_std": 0.2760905623435974, "rewards/accuracy_reward_step": 0.6015625, "rewards/asymmetric_l2_reward": 0.6782459616661072, "rewards/final_brier_reward_step": 0.6009199023246765, "rewards/format_reward_step": 0.95703125, "step": 192 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.49137931034482757, "calib/avg_num_step_conf": 8.49609375, "calib/ece": 0.4485770750988143, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00017241379310328764, "calib/mean_conf": 0.9900790513833992, "calib/mu_c": 0.99, "calib/mu_w": 0.9901724137931033, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4485770750988143, "calib/std_conf": 0.000885587213533917, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9332434640522875, "calib/step_q_c_n": 1224.0, "calib/step_q_gap": 0.005882791076472493, "calib/step_q_w": 0.927360672975815, "calib/step_q_w_n": 951.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1863.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 850.6171875, "completions/mean_terminated_length": 860.70361328125, "completions/min_length": 0.0, "completions/min_terminated_length": 501.0, "epoch": 0.20586666666666667, "grad_norm": 0.005852544214576483, "kl": 0.05562591552734375, "learning_rate": 1.9444444444444447e-07, "loss": 0.0044, "num_tokens": 62909788.0, "reward": 0.7509139776229858, "reward_std": 0.2576588988304138, "rewards/accuracy_reward_step": 0.53515625, "rewards/asymmetric_l2_reward": 0.6531758904457092, "rewards/final_brier_reward_step": 0.5439644455909729, "rewards/format_reward_step": 0.98828125, "step": 193 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5127151799687011, "calib/avg_num_step_conf": 8.734375, "calib/ece": 0.4219600000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0002556077203963447, "calib/mean_conf": 0.9899600000000001, "calib/mu_c": 0.990070422535211, "calib/mu_w": 0.9898148148148147, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4219600000000001, "calib/std_conf": 0.0010947145746723218, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9353106508875739, "calib/step_q_c_n": 1352.0, "calib/step_q_gap": 0.016475809258614693, "calib/step_q_w": 0.9188348416289592, "calib/step_q_w_n": 884.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2210.0, "completions/max_terminated_length": 2210.0, "completions/mean_length": 812.16015625, "completions/mean_terminated_length": 828.3386840820312, "completions/min_length": 0.0, "completions/min_terminated_length": 492.0, "epoch": 0.20693333333333333, "grad_norm": 0.00551479822024703, "kl": 0.05739593505859375, "learning_rate": 1.6666666666666668e-07, "loss": -0.0216, "num_tokens": 63223645.0, "reward": 0.7771685123443604, "reward_std": 0.19046615064144135, "rewards/accuracy_reward_step": 0.5546875, "rewards/asymmetric_l2_reward": 0.6849054098129272, "rewards/final_brier_reward_step": 0.5631815791130066, "rewards/format_reward_step": 0.9765625, "step": 194 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.49879275653923544, "calib/avg_num_step_conf": 8.5390625, "calib/ece": 0.41517813765182177, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.5291750503299006e-05, "calib/mean_conf": 0.990076923076923, "calib/mu_c": 0.990070422535211, "calib/mu_w": 0.9900857142857143, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.41517813765182177, "calib/std_conf": 0.0008525704443058614, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9347727272727274, "calib/step_q_c_n": 1364.0, "calib/step_q_gap": 0.016338420703384426, "calib/step_q_w": 0.918434306569343, "calib/step_q_w_n": 822.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2997.0, "completions/max_terminated_length": 2997.0, "completions/mean_length": 833.63671875, "completions/mean_terminated_length": 850.2430419921875, "completions/min_length": 0.0, "completions/min_terminated_length": 561.0, "epoch": 0.208, "grad_norm": 0.005558379925787449, "kl": 0.05661773681640625, "learning_rate": 1.3888888888888888e-07, "loss": -0.0177, "num_tokens": 63543040.0, "reward": 0.7495644688606262, "reward_std": 0.2524225115776062, "rewards/accuracy_reward_step": 0.5546875, "rewards/asymmetric_l2_reward": 0.633357048034668, "rewards/final_brier_reward_step": 0.5626468658447266, "rewards/format_reward_step": 0.9609375, "step": 195 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5098039215686274, "calib/avg_num_step_conf": 9.87890625, "calib/ece": 0.38537549407114624, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00019607843137270375, "calib/mean_conf": 0.9901185770750988, "calib/mu_c": 0.9901960784313726, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38537549407114624, "calib/std_conf": 0.001082455647243412, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9403726708074536, "calib/step_q_c_n": 1610.0, "calib/step_q_gap": 0.008109341101251166, "calib/step_q_w": 0.9322633297062024, "calib/step_q_w_n": 919.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1310.0, "completions/max_terminated_length": 1310.0, "completions/mean_length": 776.609375, "completions/mean_terminated_length": 782.7244262695312, "completions/min_length": 0.0, "completions/min_terminated_length": 432.0, "epoch": 0.20906666666666668, "grad_norm": 0.005445533432066441, "kl": 0.0676422119140625, "learning_rate": 1.1111111111111112e-07, "loss": -0.0158, "num_tokens": 63844396.0, "reward": 0.8032974004745483, "reward_std": 0.16581255197525024, "rewards/accuracy_reward_step": 0.6015625, "rewards/asymmetric_l2_reward": 0.6832548379898071, "rewards/final_brier_reward_step": 0.6053711175918579, "rewards/format_reward_step": 0.98828125, "step": 196 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.4868421052631579, "calib/avg_num_step_conf": 8.44140625, "calib/ece": 0.45165991902834, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00026315789473696505, "calib/mean_conf": 0.9901214574898785, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9902631578947368, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.45165991902834, "calib/std_conf": 0.0010953643124266147, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9346570972886762, "calib/step_q_c_n": 1254.0, "calib/step_q_gap": 0.013863271489337636, "calib/step_q_w": 0.9207938257993386, "calib/step_q_w_n": 907.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2790.0, "completions/max_terminated_length": 2790.0, "completions/mean_length": 829.27734375, "completions/mean_terminated_length": 852.59033203125, "completions/min_length": 0.0, "completions/min_terminated_length": 539.0, "epoch": 0.21013333333333334, "grad_norm": 0.005342363379895687, "kl": 0.056148529052734375, "learning_rate": 8.333333333333334e-08, "loss": -0.0289, "num_tokens": 64161747.0, "reward": 0.7426227331161499, "reward_std": 0.24280357360839844, "rewards/accuracy_reward_step": 0.51953125, "rewards/asymmetric_l2_reward": 0.6602627038955688, "rewards/final_brier_reward_step": 0.528107762336731, "rewards/format_reward_step": 0.96484375, "step": 197 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5008438818565402, "calib/avg_num_step_conf": 9.2109375, "calib/ece": 0.3530080645161291, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 5.991561181417637e-05, "calib/mean_conf": 0.9901048387096775, "calib/mu_c": 0.9901265822784808, "calib/mu_w": 0.9900666666666667, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3530080645161291, "calib/std_conf": 0.0009698565606206837, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9380546923555003, "calib/step_q_c_n": 1609.0, "calib/step_q_gap": 0.014708897962976919, "calib/step_q_w": 0.9233457943925234, "calib/step_q_w_n": 749.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3010.0, "completions/max_terminated_length": 3010.0, "completions/mean_length": 821.0703125, "completions/mean_terminated_length": 844.152587890625, "completions/min_length": 0.0, "completions/min_terminated_length": 505.0, "epoch": 0.2112, "grad_norm": 0.005194542929530144, "kl": 0.06021881103515625, "learning_rate": 5.555555555555556e-08, "loss": -0.0094, "num_tokens": 64477325.0, "reward": 0.8296013474464417, "reward_std": 0.24484550952911377, "rewards/accuracy_reward_step": 0.6171875, "rewards/asymmetric_l2_reward": 0.7179390788078308, "rewards/final_brier_reward_step": 0.6240760684013367, "rewards/format_reward_step": 0.96875, "step": 198 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.49473684210526314, "calib/avg_num_step_conf": 8.80859375, "calib/ece": 0.36553359683794473, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00010526315789449736, "calib/mean_conf": 0.9900395256916996, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.9901052631578943, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36553359683794473, "calib/std_conf": 0.0006274509038097849, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9359807692307693, "calib/step_q_c_n": 1560.0, "calib/step_q_gap": 0.014700193691201013, "calib/step_q_w": 0.9212805755395683, "calib/step_q_w_n": 695.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2158.0, "completions/max_terminated_length": 2158.0, "completions/mean_length": 862.57421875, "completions/mean_terminated_length": 869.3661499023438, "completions/min_length": 0.0, "completions/min_terminated_length": 416.0, "epoch": 0.21226666666666666, "grad_norm": 0.005453071091324091, "kl": 0.05690765380859375, "learning_rate": 2.777777777777778e-08, "loss": -0.0268, "num_tokens": 64802344.0, "reward": 0.8347955942153931, "reward_std": 0.24365174770355225, "rewards/accuracy_reward_step": 0.6171875, "rewards/asymmetric_l2_reward": 0.7249236106872559, "rewards/final_brier_reward_step": 0.6243550181388855, "rewards/format_reward_step": 0.984375, "step": 199 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5115928831605492, "calib/avg_num_step_conf": 8.40625, "calib/ece": 0.33111111111111124, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.000465116279070088, "calib/mean_conf": 0.98984126984127, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9895348837209298, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.33111111111111124, "calib/std_conf": 0.0021763982858416025, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9363337547408344, "calib/step_q_c_n": 1582.0, "calib/step_q_gap": 0.02156182491627301, "calib/step_q_w": 0.9147719298245613, "calib/step_q_w_n": 570.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1801.0, "completions/max_terminated_length": 1801.0, "completions/mean_length": 853.05859375, "completions/mean_terminated_length": 859.7755737304688, "completions/min_length": 0.0, "completions/min_terminated_length": 422.0, "epoch": 0.21333333333333335, "grad_norm": 0.005459709092974663, "kl": 0.05413818359375, "learning_rate": 0.0, "loss": -0.0148, "num_tokens": 65128775.0, "reward": 0.8560566306114197, "reward_std": 0.16525955498218536, "rewards/accuracy_reward_step": 0.6484375, "rewards/asymmetric_l2_reward": 0.7319831848144531, "rewards/final_brier_reward_step": 0.6551300287246704, "rewards/format_reward_step": 0.9765625, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": 2.0760135896515566, "train_runtime": 19368.0064, "train_samples_per_second": 2.644, "train_steps_per_second": 0.01 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 65128775, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }