{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "calib/answer_extract_rate": 0.06640625, "calib/auroc": 0.75, "calib/avg_num_step_conf": 0.359375, "calib/ece": 0.5285714285714285, "calib/final_conf_rate": 0.0546875, "calib/format_rate": 0.04296875, "calib/frac_conf_gt_0.9": 0.9285714285714286, "calib/gap": 0.02833333333333321, "calib/mean_conf": 0.9571428571428572, "calib/mu_c": 0.9733333333333333, "calib/mu_w": 0.9450000000000001, "calib/nonempty_final_conf_rate": 0.0546875, "calib/nonempty_reasoning_rate": 0.078125, "calib/nonempty_step_conf_rate": 0.05859375, "calib/pce": 0.5285714285714285, "calib/std_conf": 0.033896601479156206, "calib/step_conf_rate": 0.05859375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2853.0, "completions/max_terminated_length": 2853.0, "completions/mean_length": 658.8203125, "completions/mean_terminated_length": 714.6525268554688, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0010666666666666667, "grad_norm": 0.0997648537158966, "learning_rate": 2.5000000000000004e-07, "loss": 0.0135, "num_tokens": 276242.0, "reward": 0.045721352100372314, "reward_std": 0.08725354075431824, "rewards/accuracy_reward_step": 0.02734375, "rewards/final_brier_reward_step": 0.024793751537799835, "rewards/format_reward_step": 0.04296875, "rewards/stepwise_brier_reward": 0.03152916580438614, "step": 1 }, { "calib/answer_extract_rate": 0.06640625, "calib/auroc": 0.1851851851851852, "calib/avg_num_step_conf": 0.24609375, "calib/ece": 0.2141666666666665, "calib/final_conf_rate": 0.046875, "calib/format_rate": 0.046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.01666666666666672, "calib/mean_conf": 0.9641666666666665, "calib/mu_c": 0.9599999999999999, "calib/mu_w": 0.9766666666666666, "calib/nonempty_final_conf_rate": 0.046875, "calib/nonempty_reasoning_rate": 0.06640625, "calib/nonempty_step_conf_rate": 0.0546875, "calib/pce": 0.2141666666666665, "calib/std_conf": 0.014409680388158833, "calib/step_conf_rate": 0.0546875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 2927.0, "completions/max_terminated_length": 2927.0, "completions/mean_length": 749.54296875, "completions/mean_terminated_length": 820.0128784179688, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0021333333333333334, "grad_norm": 0.08957482129335403, "learning_rate": 5.000000000000001e-07, "loss": -0.0133, "num_tokens": 571413.0, "reward": 0.06095839664340019, "reward_std": 0.14099186658859253, "rewards/accuracy_reward_step": 0.0390625, "rewards/final_brier_reward_step": 0.03563320264220238, "rewards/format_reward_step": 0.046875, "rewards/stepwise_brier_reward": 0.03320039063692093, "step": 2 }, { "calib/answer_extract_rate": 0.0390625, "calib/auroc": 0.7142857142857143, "calib/avg_num_step_conf": 0.203125, "calib/ece": 0.09499999999999997, "calib/final_conf_rate": 0.03125, "calib/format_rate": 0.02734375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.011428571428571566, "calib/mean_conf": 0.97, "calib/mu_c": 0.9714285714285715, "calib/mu_w": 0.96, "calib/nonempty_final_conf_rate": 0.03125, "calib/nonempty_reasoning_rate": 0.0390625, "calib/nonempty_step_conf_rate": 0.03125, "calib/pce": 0.09499999999999997, "calib/std_conf": 0.026457513110645887, "calib/step_conf_rate": 0.03125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.12890625, "completions/max_length": 2881.0, "completions/max_terminated_length": 2881.0, "completions/mean_length": 598.88671875, "completions/mean_terminated_length": 687.51123046875, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0032, "grad_norm": 0.012924089096486568, "learning_rate": 7.5e-07, "loss": -0.0063, "num_tokens": 829984.0, "reward": 0.04115130752325058, "reward_std": 0.09515757858753204, "rewards/accuracy_reward_step": 0.02734375, "rewards/final_brier_reward_step": 0.023719141259789467, "rewards/format_reward_step": 0.02734375, "rewards/stepwise_brier_reward": 0.020573580637574196, "step": 3 }, { "calib/answer_extract_rate": 0.03515625, "calib/auroc": 0.8333333333333334, "calib/avg_num_step_conf": 0.1484375, "calib/ece": 0.70375, "calib/final_conf_rate": 0.03125, "calib/format_rate": 0.01953125, "calib/frac_conf_gt_0.9": 0.75, "calib/gap": 0.03500000000000003, "calib/mean_conf": 0.95375, "calib/mu_c": 0.98, "calib/mu_w": 0.945, "calib/nonempty_final_conf_rate": 0.03125, "calib/nonempty_reasoning_rate": 0.04296875, "calib/nonempty_step_conf_rate": 0.02734375, "calib/pce": 0.70375, "calib/std_conf": 0.03425547401511179, "calib/step_conf_rate": 0.02734375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 2746.0, "completions/max_terminated_length": 2746.0, "completions/mean_length": 705.43359375, "completions/mean_terminated_length": 775.0686645507812, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.004266666666666667, "grad_norm": 0.1843508929014206, "learning_rate": 1.0000000000000002e-06, "loss": 0.0021, "num_tokens": 1116743.0, "reward": 0.013817477971315384, "reward_std": 0.03473982587456703, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.005259375087916851, "rewards/format_reward_step": 0.01953125, "rewards/stepwise_brier_reward": 0.010948040522634983, "step": 4 }, { "calib/answer_extract_rate": 0.0390625, "calib/auroc": 0.5625, "calib/avg_num_step_conf": 0.08203125, "calib/ece": 0.7370000000000001, "calib/final_conf_rate": 0.0390625, "calib/format_rate": 0.01953125, "calib/frac_conf_gt_0.9": 0.8, "calib/gap": 0.028749999999999942, "calib/mean_conf": 0.9369999999999999, "calib/mu_c": 0.96, "calib/mu_w": 0.93125, "calib/nonempty_final_conf_rate": 0.0390625, "calib/nonempty_reasoning_rate": 0.0390625, "calib/nonempty_step_conf_rate": 0.0234375, "calib/pce": 0.7370000000000001, "calib/std_conf": 0.06783067152844648, "calib/step_conf_rate": 0.0234375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3022.0, "completions/max_terminated_length": 3022.0, "completions/mean_length": 685.53515625, "completions/mean_terminated_length": 783.4688110351562, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.005333333333333333, "grad_norm": 0.007154698017984629, "learning_rate": 1.25e-06, "loss": 0.0224, "num_tokens": 1398928.0, "reward": 0.01371738314628601, "reward_std": 0.03683717921376228, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.006437499541789293, "rewards/format_reward_step": 0.01953125, "rewards/stepwise_brier_reward": 0.009369528852403164, "step": 5 }, { "calib/answer_extract_rate": 0.03515625, "calib/auroc": 0.5, "calib/avg_num_step_conf": 0.12109375, "calib/ece": 0.5619999999999999, "calib/final_conf_rate": 0.0390625, "calib/format_rate": 0.015625, "calib/frac_conf_gt_0.9": 0.9, "calib/gap": 0.0050000000000001155, "calib/mean_conf": 0.962, "calib/mu_c": 0.9650000000000001, "calib/mu_w": 0.96, "calib/nonempty_final_conf_rate": 0.0390625, "calib/nonempty_reasoning_rate": 0.0390625, "calib/nonempty_step_conf_rate": 0.0234375, "calib/pce": 0.5619999999999999, "calib/std_conf": 0.029933259094191537, "calib/step_conf_rate": 0.0234375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 3057.0, "completions/max_terminated_length": 3057.0, "completions/mean_length": 583.2265625, "completions/mean_terminated_length": 660.64599609375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0064, "grad_norm": 0.26431453227996826, "learning_rate": 1.5e-06, "loss": 0.0118, "num_tokens": 1654186.0, "reward": 0.022690270096063614, "reward_std": 0.05280275642871857, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.011999217793345451, "rewards/format_reward_step": 0.015625, "rewards/stepwise_brier_reward": 0.010011857375502586, "step": 6 }, { "calib/answer_extract_rate": 0.03515625, "calib/auroc": 0.8333333333333334, "calib/avg_num_step_conf": 0.15234375, "calib/ece": 0.8114285714285716, "calib/final_conf_rate": 0.02734375, "calib/format_rate": 0.0234375, "calib/frac_conf_gt_0.9": 0.8571428571428571, "calib/gap": 0.029999999999999916, "calib/mean_conf": 0.9542857142857144, "calib/mu_c": 0.98, "calib/mu_w": 0.9500000000000001, "calib/nonempty_final_conf_rate": 0.02734375, "calib/nonempty_reasoning_rate": 0.03515625, "calib/nonempty_step_conf_rate": 0.0234375, "calib/pce": 0.8114285714285716, "calib/std_conf": 0.030169588688489804, "calib/step_conf_rate": 0.0234375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.14453125, "completions/max_length": 3064.0, "completions/max_terminated_length": 3064.0, "completions/mean_length": 763.421875, "completions/mean_terminated_length": 892.4017944335938, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.007466666666666667, "grad_norm": 0.005451935809105635, "learning_rate": 1.75e-06, "loss": 0.015, "num_tokens": 1957046.0, "reward": 0.010876026004552841, "reward_std": 0.02719159796833992, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.00556796882301569, "rewards/format_reward_step": 0.0234375, "rewards/stepwise_brier_reward": 0.01293613389134407, "step": 7 }, { "calib/answer_extract_rate": 0.04296875, "calib/auroc": 0.34375, "calib/avg_num_step_conf": 0.28515625, "calib/ece": 0.625, "calib/final_conf_rate": 0.046875, "calib/format_rate": 0.03515625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.012499999999999956, "calib/mean_conf": 0.9583333333333334, "calib/mu_c": 0.95, "calib/mu_w": 0.9624999999999999, "calib/nonempty_final_conf_rate": 0.046875, "calib/nonempty_reasoning_rate": 0.05859375, "calib/nonempty_step_conf_rate": 0.0546875, "calib/pce": 0.625, "calib/std_conf": 0.01818118685772619, "calib/step_conf_rate": 0.0546875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 3016.0, "completions/max_terminated_length": 3016.0, "completions/mean_length": 755.81640625, "completions/mean_terminated_length": 823.357421875, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.008533333333333334, "grad_norm": 0.013414675369858742, "learning_rate": 2.0000000000000003e-06, "loss": 0.0208, "num_tokens": 2257047.0, "reward": 0.02883915603160858, "reward_std": 0.06699974834918976, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.013510936871170998, "rewards/format_reward_step": 0.03515625, "rewards/stepwise_brier_reward": 0.02528318762779236, "step": 8 }, { "calib/answer_extract_rate": 0.03125, "calib/auroc": 0.25, "calib/avg_num_step_conf": 0.11328125, "calib/ece": 0.46499999999999997, "calib/final_conf_rate": 0.03125, "calib/format_rate": 0.0234375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.010000000000000009, "calib/mean_conf": 0.965, "calib/mu_c": 0.96, "calib/mu_w": 0.97, "calib/nonempty_final_conf_rate": 0.03125, "calib/nonempty_reasoning_rate": 0.03515625, "calib/nonempty_step_conf_rate": 0.02734375, "calib/pce": 0.46499999999999997, "calib/std_conf": 0.011180339887498959, "calib/step_conf_rate": 0.02734375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.12109375, "completions/max_length": 2895.0, "completions/max_terminated_length": 2895.0, "completions/mean_length": 646.27734375, "completions/mean_terminated_length": 735.3200073242188, "completions/min_length": 0.0, "completions/min_terminated_length": 17.0, "epoch": 0.0096, "grad_norm": 0.009137723594903946, "learning_rate": 2.25e-06, "loss": -0.0052, "num_tokens": 2530030.0, "reward": 0.02405240572988987, "reward_std": 0.05674830451607704, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.012311328202486038, "rewards/format_reward_step": 0.0234375, "rewards/stepwise_brier_reward": 0.012023289687931538, "step": 9 }, { "calib/answer_extract_rate": 0.0390625, "calib/auroc": 0.6000000000000001, "calib/avg_num_step_conf": 0.16796875, "calib/ece": 0.345, "calib/final_conf_rate": 0.03125, "calib/format_rate": 0.02734375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1102230246251565e-16, "calib/mean_conf": 0.97, "calib/mu_c": 0.97, "calib/mu_w": 0.9700000000000001, "calib/nonempty_final_conf_rate": 0.03125, "calib/nonempty_reasoning_rate": 0.046875, "calib/nonempty_step_conf_rate": 0.0390625, "calib/pce": 0.345, "calib/std_conf": 0.010000000000000009, "calib/step_conf_rate": 0.0390625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2929.0, "completions/max_terminated_length": 2929.0, "completions/mean_length": 776.16015625, "completions/mean_terminated_length": 838.3839111328125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.010666666666666666, "grad_norm": 0.01142524927854538, "learning_rate": 2.5e-06, "loss": -0.0074, "num_tokens": 2835527.0, "reward": 0.030315101146697998, "reward_std": 0.06212481111288071, "rewards/accuracy_reward_step": 0.01953125, "rewards/final_brier_reward_step": 0.016300391405820847, "rewards/format_reward_step": 0.02734375, "rewards/stepwise_brier_reward": 0.015897512435913086, "step": 10 }, { "calib/answer_extract_rate": 0.015625, "calib/avg_num_step_conf": 0.04296875, "calib/ece": 0.96, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.96, "calib/mu_c": NaN, "calib/mu_w": 0.96, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.01953125, "calib/nonempty_step_conf_rate": 0.015625, "calib/pce": 0.96, "calib/std_conf": 0.010000000000000009, "calib/step_conf_rate": 0.015625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13671875, "completions/max_length": 3068.0, "completions/max_terminated_length": 3068.0, "completions/mean_length": 782.84375, "completions/mean_terminated_length": 906.8235473632812, "completions/min_length": 0.0, "completions/min_terminated_length": 90.0, "epoch": 0.011733333333333333, "grad_norm": 0.003873982233926654, "learning_rate": 2.7500000000000004e-06, "loss": -0.0017, "num_tokens": 3140415.0, "reward": 0.0011316046584397554, "reward_std": 0.003200661391019821, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0002308593684574589, "rewards/format_reward_step": 0.00390625, "rewards/stepwise_brier_reward": 0.002733059460297227, "step": 11 }, { "calib/answer_extract_rate": 0.08203125, "calib/auroc": 0.5486111111111112, "calib/avg_num_step_conf": 0.36328125, "calib/ece": 0.4882352941176471, "calib/final_conf_rate": 0.06640625, "calib/format_rate": 0.046875, "calib/frac_conf_gt_0.9": 0.9411764705882353, "calib/gap": 0.028194444444444522, "calib/mean_conf": 0.9588235294117647, "calib/mu_c": 0.97375, "calib/mu_w": 0.9455555555555555, "calib/nonempty_final_conf_rate": 0.06640625, "calib/nonempty_reasoning_rate": 0.0859375, "calib/nonempty_step_conf_rate": 0.05859375, "calib/pce": 0.4882352941176471, "calib/std_conf": 0.06057395605579295, "calib/step_conf_rate": 0.05859375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 3034.0, "completions/max_terminated_length": 3034.0, "completions/mean_length": 674.2421875, "completions/mean_terminated_length": 725.2353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0128, "grad_norm": 0.017339495941996574, "learning_rate": 3e-06, "loss": -0.0056, "num_tokens": 3417197.0, "reward": 0.05750560015439987, "reward_std": 0.11294373869895935, "rewards/accuracy_reward_step": 0.0390625, "rewards/final_brier_reward_step": 0.02464335970580578, "rewards/format_reward_step": 0.046875, "rewards/stepwise_brier_reward": 0.030379030853509903, "step": 12 }, { "calib/answer_extract_rate": 0.05859375, "calib/auroc": 0.4666666666666666, "calib/avg_num_step_conf": 0.578125, "calib/ece": 0.58375, "calib/final_conf_rate": 0.0625, "calib/format_rate": 0.046875, "calib/frac_conf_gt_0.9": 0.9375, "calib/gap": -0.0033333333333332993, "calib/mean_conf": 0.95875, "calib/mu_c": 0.9566666666666667, "calib/mu_w": 0.96, "calib/nonempty_final_conf_rate": 0.0625, "calib/nonempty_reasoning_rate": 0.08203125, "calib/nonempty_step_conf_rate": 0.078125, "calib/pce": 0.58375, "calib/std_conf": 0.027810744326608728, "calib/step_conf_rate": 0.078125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 2900.0, "completions/max_terminated_length": 2900.0, "completions/mean_length": 776.078125, "completions/mean_terminated_length": 863.8086547851562, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.013866666666666666, "grad_norm": 0.02277466468513012, "learning_rate": 3.2500000000000002e-06, "loss": -0.0352, "num_tokens": 3720465.0, "reward": 0.0410991869866848, "reward_std": 0.10069207847118378, "rewards/accuracy_reward_step": 0.0234375, "rewards/final_brier_reward_step": 0.024826953187584877, "rewards/format_reward_step": 0.046875, "rewards/stepwise_brier_reward": 0.02706979028880596, "step": 13 }, { "calib/answer_extract_rate": 0.07421875, "calib/auroc": 0.39880952380952384, "calib/avg_num_step_conf": 0.42578125, "calib/ece": 0.3042105263157895, "calib/final_conf_rate": 0.07421875, "calib/format_rate": 0.05859375, "calib/frac_conf_gt_0.9": 0.8421052631578947, "calib/gap": 0.09940476190476188, "calib/mean_conf": 0.9042105263157895, "calib/mu_c": 0.9408333333333333, "calib/mu_w": 0.8414285714285714, "calib/nonempty_final_conf_rate": 0.07421875, "calib/nonempty_reasoning_rate": 0.078125, "calib/nonempty_step_conf_rate": 0.0703125, "calib/pce": 0.28842105263157897, "calib/std_conf": 0.21546316792273787, "calib/step_conf_rate": 0.0703125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 3068.0, "completions/max_terminated_length": 3068.0, "completions/mean_length": 814.32421875, "completions/mean_terminated_length": 890.8846435546875, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.014933333333333333, "grad_norm": 0.015021421946585178, "learning_rate": 3.5e-06, "loss": -0.0044, "num_tokens": 4034332.0, "reward": 0.074186772108078, "reward_std": 0.17084990441799164, "rewards/accuracy_reward_step": 0.046875, "rewards/final_brier_reward_step": 0.04705468565225601, "rewards/format_reward_step": 0.05859375, "rewards/stepwise_brier_reward": 0.0387549065053463, "step": 14 }, { "calib/answer_extract_rate": 0.07421875, "calib/auroc": 0.42500000000000004, "calib/avg_num_step_conf": 0.42578125, "calib/ece": 0.6499999999999999, "calib/final_conf_rate": 0.06640625, "calib/format_rate": 0.06640625, "calib/frac_conf_gt_0.9": 0.8235294117647058, "calib/gap": 0.011166666666666547, "calib/mean_conf": 0.9441176470588236, "calib/mu_c": 0.952, "calib/mu_w": 0.9408333333333334, "calib/nonempty_final_conf_rate": 0.06640625, "calib/nonempty_reasoning_rate": 0.0859375, "calib/nonempty_step_conf_rate": 0.08203125, "calib/pce": 0.6499999999999999, "calib/std_conf": 0.0590173278173365, "calib/step_conf_rate": 0.08203125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 3018.0, "completions/max_terminated_length": 3018.0, "completions/mean_length": 686.26171875, "completions/mean_terminated_length": 770.5394897460938, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.016, "grad_norm": 0.017280153930187225, "learning_rate": 3.7500000000000005e-06, "loss": -0.0017, "num_tokens": 4317895.0, "reward": 0.042646847665309906, "reward_std": 0.09332282841205597, "rewards/accuracy_reward_step": 0.01953125, "rewards/final_brier_reward_step": 0.02463945373892784, "rewards/format_reward_step": 0.06640625, "rewards/stepwise_brier_reward": 0.04126042127609253, "step": 15 }, { "calib/answer_extract_rate": 0.05859375, "calib/auroc": 0.6181818181818183, "calib/avg_num_step_conf": 0.51953125, "calib/ece": 0.6456250000000001, "calib/final_conf_rate": 0.0625, "calib/format_rate": 0.05078125, "calib/frac_conf_gt_0.9": 0.875, "calib/gap": 0.011454545454545384, "calib/mean_conf": 0.9581249999999999, "calib/mu_c": 0.966, "calib/mu_w": 0.9545454545454546, "calib/nonempty_final_conf_rate": 0.0625, "calib/nonempty_reasoning_rate": 0.078125, "calib/nonempty_step_conf_rate": 0.07421875, "calib/pce": 0.6456250000000001, "calib/std_conf": 0.0371178444282531, "calib/step_conf_rate": 0.07421875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 3040.0, "completions/max_terminated_length": 3040.0, "completions/mean_length": 807.828125, "completions/mean_terminated_length": 883.77783203125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.017066666666666667, "grad_norm": 0.011121639981865883, "learning_rate": 4.000000000000001e-06, "loss": -0.0114, "num_tokens": 4633547.0, "reward": 0.04202704131603241, "reward_std": 0.09346764534711838, "rewards/accuracy_reward_step": 0.0234375, "rewards/final_brier_reward_step": 0.021848436444997787, "rewards/format_reward_step": 0.05078125, "rewards/stepwise_brier_reward": 0.03219722583889961, "step": 16 }, { "calib/answer_extract_rate": 0.140625, "calib/auroc": 0.6128472222222223, "calib/avg_num_step_conf": 0.84765625, "calib/ece": 0.3025, "calib/final_conf_rate": 0.140625, "calib/format_rate": 0.1171875, "calib/frac_conf_gt_0.9": 0.9166666666666666, "calib/gap": 0.011666666666666714, "calib/mean_conf": 0.955277777777778, "calib/mu_c": 0.9591666666666666, "calib/mu_w": 0.9474999999999999, "calib/nonempty_final_conf_rate": 0.140625, "calib/nonempty_reasoning_rate": 0.15625, "calib/nonempty_step_conf_rate": 0.140625, "calib/pce": 0.2955555555555555, "calib/std_conf": 0.04896064809342697, "calib/step_conf_rate": 0.140625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2818.0, "completions/max_terminated_length": 2818.0, "completions/mean_length": 707.8359375, "completions/mean_terminated_length": 794.76318359375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.018133333333333335, "grad_norm": 0.021026864647865295, "learning_rate": 4.25e-06, "loss": -0.0136, "num_tokens": 4918281.0, "reward": 0.148117333650589, "reward_std": 0.2702571451663971, "rewards/accuracy_reward_step": 0.09375, "rewards/final_brier_reward_step": 0.08581054210662842, "rewards/format_reward_step": 0.1171875, "rewards/stepwise_brier_reward": 0.08478375524282455, "step": 17 }, { "calib/answer_extract_rate": 0.109375, "calib/auroc": 0.34375, "calib/avg_num_step_conf": 0.72265625, "calib/ece": 0.497, "calib/final_conf_rate": 0.1171875, "calib/format_rate": 0.08984375, "calib/frac_conf_gt_0.9": 0.8666666666666667, "calib/gap": -0.021607142857142603, "calib/mean_conf": 0.9536666666666667, "calib/mu_c": 0.9421428571428574, "calib/mu_w": 0.96375, "calib/nonempty_final_conf_rate": 0.1171875, "calib/nonempty_reasoning_rate": 0.13671875, "calib/nonempty_step_conf_rate": 0.125, "calib/pce": 0.492, "calib/std_conf": 0.03311428023610894, "calib/step_conf_rate": 0.125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 3002.0, "completions/max_terminated_length": 3002.0, "completions/mean_length": 778.2421875, "completions/mean_terminated_length": 837.1008911132812, "completions/min_length": 0.0, "completions/min_terminated_length": 30.0, "epoch": 0.0192, "grad_norm": 0.010049736127257347, "learning_rate": 4.5e-06, "loss": -0.0069, "num_tokens": 5228231.0, "reward": 0.09567424654960632, "reward_std": 0.1763695776462555, "rewards/accuracy_reward_step": 0.05859375, "rewards/final_brier_reward_step": 0.05296015739440918, "rewards/format_reward_step": 0.08984375, "rewards/stepwise_brier_reward": 0.059424325823783875, "step": 18 }, { "calib/answer_extract_rate": 0.48046875, "calib/auroc": 0.5018424036281179, "calib/avg_num_step_conf": 2.98046875, "calib/ece": 0.47638655462184876, "calib/final_conf_rate": 0.46484375, "calib/format_rate": 0.40625, "calib/frac_conf_gt_0.9": 0.8991596638655462, "calib/gap": 0.00605158730158728, "calib/mean_conf": 0.9469747899159664, "calib/mu_c": 0.9501785714285713, "calib/mu_w": 0.944126984126984, "calib/nonempty_final_conf_rate": 0.46484375, "calib/nonempty_reasoning_rate": 0.51953125, "calib/nonempty_step_conf_rate": 0.46484375, "calib/pce": 0.47638655462184876, "calib/std_conf": 0.06739516754300073, "calib/step_conf_rate": 0.46484375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 3047.0, "completions/max_terminated_length": 3047.0, "completions/mean_length": 640.01953125, "completions/mean_terminated_length": 677.04541015625, "completions/min_length": 0.0, "completions/min_terminated_length": 11.0, "epoch": 0.020266666666666665, "grad_norm": 0.030161771923303604, "learning_rate": 4.75e-06, "loss": -0.0134, "num_tokens": 5496836.0, "reward": 0.390655517578125, "reward_std": 0.45137089490890503, "rewards/accuracy_reward_step": 0.21875, "rewards/final_brier_reward_step": 0.23021560907363892, "rewards/format_reward_step": 0.40625, "rewards/stepwise_brier_reward": 0.2949064373970032, "step": 19 }, { "calib/answer_extract_rate": 0.7265625, "calib/auroc": 0.5629500450045004, "calib/avg_num_step_conf": 4.70703125, "calib/ece": 0.4911058201058203, "calib/final_conf_rate": 0.73828125, "calib/format_rate": 0.671875, "calib/frac_conf_gt_0.9": 0.9153439153439153, "calib/gap": 0.012506638163816364, "calib/mean_conf": 0.9567142857142859, "calib/mu_c": 0.9633977272727274, "calib/mu_w": 0.9508910891089111, "calib/nonempty_final_conf_rate": 0.73828125, "calib/nonempty_reasoning_rate": 0.7734375, "calib/nonempty_step_conf_rate": 0.7265625, "calib/pce": 0.4911058201058203, "calib/std_conf": 0.04246882276481746, "calib/step_conf_rate": 0.7265625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2817.0, "completions/max_terminated_length": 2817.0, "completions/mean_length": 587.50390625, "completions/mean_terminated_length": 599.2072143554688, "completions/min_length": 0.0, "completions/min_terminated_length": 25.0, "epoch": 0.021333333333333333, "grad_norm": 0.02605314552783966, "learning_rate": 5e-06, "loss": -0.0378, "num_tokens": 5752109.0, "reward": 0.6251773238182068, "reward_std": 0.42109954357147217, "rewards/accuracy_reward_step": 0.34765625, "rewards/final_brier_reward_step": 0.3618890643119812, "rewards/format_reward_step": 0.671875, "rewards/stepwise_brier_reward": 0.47944512963294983, "step": 20 }, { "calib/answer_extract_rate": 0.84765625, "calib/auroc": 0.43017094017094015, "calib/avg_num_step_conf": 5.94921875, "calib/ece": 0.42191244193548394, "calib/final_conf_rate": 0.84765625, "calib/format_rate": 0.796875, "calib/frac_conf_gt_0.9": 0.8940092165898618, "calib/gap": -0.016753417803418658, "calib/mean_conf": 0.9489170502304147, "calib/mu_c": 0.9411965811965812, "calib/mu_w": 0.9579499989999999, "calib/nonempty_final_conf_rate": 0.84765625, "calib/nonempty_reasoning_rate": 0.921875, "calib/nonempty_step_conf_rate": 0.87890625, "calib/pce": 0.41582949262672814, "calib/std_conf": 0.0714617134028384, "calib/step_conf_rate": 0.87890625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2552.0, "completions/max_terminated_length": 2552.0, "completions/mean_length": 519.27734375, "completions/mean_terminated_length": 519.27734375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0224, "grad_norm": 0.021605942398309708, "learning_rate": 4.9722222222222224e-06, "loss": -0.0186, "num_tokens": 5988004.0, "reward": 0.797073245048523, "reward_std": 0.5415612459182739, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.45556551218032837, "rewards/format_reward_step": 0.796875, "rewards/stepwise_brier_reward": 0.5702272057533264, "step": 21 }, { "calib/answer_extract_rate": 0.8828125, "calib/auroc": 0.5140350877192983, "calib/avg_num_step_conf": 6.15625, "calib/ece": 0.3825791855203621, "calib/final_conf_rate": 0.86328125, "calib/format_rate": 0.8359375, "calib/frac_conf_gt_0.9": 0.9411764705882353, "calib/gap": 0.009584795321637585, "calib/mean_conf": 0.9519909502262442, "calib/mu_c": 0.9561111111111111, "calib/mu_w": 0.9465263157894735, "calib/nonempty_final_conf_rate": 0.86328125, "calib/nonempty_reasoning_rate": 0.93359375, "calib/nonempty_step_conf_rate": 0.90625, "calib/pce": 0.38221719457013587, "calib/std_conf": 0.08055025642109596, "calib/step_conf_rate": 0.90625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1406.0, "completions/max_terminated_length": 1406.0, "completions/mean_length": 455.28125, "completions/mean_terminated_length": 464.3506164550781, "completions/min_length": 0.0, "completions/min_terminated_length": 28.0, "epoch": 0.023466666666666667, "grad_norm": 0.018135301768779755, "learning_rate": 4.944444444444445e-06, "loss": -0.0055, "num_tokens": 6206372.0, "reward": 0.8581201434135437, "reward_std": 0.5193147659301758, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.5079483985900879, "rewards/format_reward_step": 0.8359375, "rewards/stepwise_brier_reward": 0.605782151222229, "step": 22 }, { "calib/answer_extract_rate": 0.91015625, "calib/auroc": 0.47636872353297033, "calib/avg_num_step_conf": 6.734375, "calib/ece": 0.44255869565217404, "calib/final_conf_rate": 0.8984375, "calib/format_rate": 0.86328125, "calib/frac_conf_gt_0.9": 0.8956521739130435, "calib/gap": 0.009015350877192962, "calib/mean_conf": 0.9469065217391305, "calib/mu_c": 0.9513750000000001, "calib/mu_w": 0.9423596491228071, "calib/nonempty_final_conf_rate": 0.8984375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.44255869565217404, "calib/std_conf": 0.06230186821880926, "calib/step_conf_rate": 0.953125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2819.0, "completions/max_terminated_length": 2819.0, "completions/mean_length": 511.16796875, "completions/mean_terminated_length": 515.1929321289062, "completions/min_length": 0.0, "completions/min_terminated_length": 81.0, "epoch": 0.024533333333333334, "grad_norm": 0.017454328015446663, "learning_rate": 4.9166666666666665e-06, "loss": 0.0056, "num_tokens": 6441167.0, "reward": 0.8283648490905762, "reward_std": 0.5576850175857544, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.4851628839969635, "rewards/format_reward_step": 0.86328125, "rewards/stepwise_brier_reward": 0.5923591256141663, "step": 23 }, { "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.5693080612011875, "calib/avg_num_step_conf": 7.3203125, "calib/ece": 0.5824201680672267, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.87890625, "calib/frac_conf_gt_0.9": 0.9117647058823529, "calib/gap": 0.017155210474232963, "calib/mean_conf": 0.9479663865546218, "calib/mu_c": 0.9588505747126436, "calib/mu_w": 0.9416953642384106, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.5824201680672267, "calib/std_conf": 0.056991512381947426, "calib/step_conf_rate": 0.96875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2504.0, "completions/max_terminated_length": 2504.0, "completions/mean_length": 494.35546875, "completions/mean_terminated_length": 498.2480163574219, "completions/min_length": 0.0, "completions/min_terminated_length": 98.0, "epoch": 0.0256, "grad_norm": 0.02592473290860653, "learning_rate": 4.888888888888889e-06, "loss": -0.022, "num_tokens": 6672234.0, "reward": 0.6669620275497437, "reward_std": 0.529405415058136, "rewards/accuracy_reward_step": 0.34375, "rewards/final_brier_reward_step": 0.3822558522224426, "rewards/format_reward_step": 0.87890625, "rewards/stepwise_brier_reward": 0.5590295195579529, "step": 24 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5177096774193548, "calib/avg_num_step_conf": 6.96484375, "calib/ece": 0.4455421686746987, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.8433734939759037, "calib/gap": -0.0011348387096772328, "calib/mean_conf": 0.941285140562249, "calib/mu_c": 0.94072, "calib/mu_w": 0.9418548387096772, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.4424096385542168, "calib/std_conf": 0.05465439994927749, "calib/step_conf_rate": 0.98046875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1051.0, "completions/max_terminated_length": 1051.0, "completions/mean_length": 438.1015625, "completions/mean_terminated_length": 445.0555725097656, "completions/min_length": 0.0, "completions/min_terminated_length": 54.0, "epoch": 0.02666666666666667, "grad_norm": 0.01844349503517151, "learning_rate": 4.861111111111111e-06, "loss": -0.0026, "num_tokens": 6887612.0, "reward": 0.8847516179084778, "reward_std": 0.47158464789390564, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.5363632440567017, "rewards/format_reward_step": 0.9453125, "rewards/stepwise_brier_reward": 0.6557682752609253, "step": 25 }, { "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.41631245209393075, "calib/avg_num_step_conf": 6.67578125, "calib/ece": 0.4092500000000001, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.8166666666666667, "calib/gap": -0.01285903421364376, "calib/mean_conf": 0.9384166666666668, "calib/mu_c": 0.9323622047244093, "calib/mu_w": 0.9452212389380531, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.4092500000000001, "calib/std_conf": 0.04855402203273746, "calib/step_conf_rate": 0.96875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2938.0, "completions/max_terminated_length": 2938.0, "completions/mean_length": 458.9375, "completions/mean_terminated_length": 462.5511779785156, "completions/min_length": 0.0, "completions/min_terminated_length": 44.0, "epoch": 0.027733333333333332, "grad_norm": 0.020758124068379402, "learning_rate": 4.833333333333333e-06, "loss": -0.0045, "num_tokens": 7110340.0, "reward": 0.882733166217804, "reward_std": 0.49363207817077637, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.536698043346405, "rewards/format_reward_step": 0.921875, "rewards/stepwise_brier_reward": 0.6411095261573792, "step": 26 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4373590982286634, "calib/avg_num_step_conf": 6.953125, "calib/ece": 0.4666399999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.8, "calib/gap": -0.002698872785829143, "calib/mean_conf": 0.9222400000000001, "calib/mu_c": 0.9207826086956522, "calib/mu_w": 0.9234814814814813, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.4644399999999999, "calib/std_conf": 0.08231999999999999, "calib/step_conf_rate": 0.98828125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 443.4296875, "completions/mean_terminated_length": 450.4682922363281, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.0288, "grad_norm": 0.014401337131857872, "learning_rate": 4.805555555555556e-06, "loss": 0.0271, "num_tokens": 7329074.0, "reward": 0.8356722593307495, "reward_std": 0.46682795882225037, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.5097249746322632, "rewards/format_reward_step": 0.953125, "rewards/stepwise_brier_reward": 0.6548388600349426, "step": 27 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.451887417218543, "calib/avg_num_step_conf": 6.234375, "calib/ece": 0.3188844621513945, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.6772908366533864, "calib/gap": -0.013780132450331162, "calib/mean_conf": 0.9125099601593624, "calib/mu_c": 0.9070198675496688, "calib/mu_w": 0.9208, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.31490039840637457, "calib/std_conf": 0.0765209323521755, "calib/step_conf_rate": 0.984375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1638.0, "completions/max_terminated_length": 1638.0, "completions/mean_length": 480.5234375, "completions/mean_terminated_length": 488.15081787109375, "completions/min_length": 0.0, "completions/min_terminated_length": 68.0, "epoch": 0.029866666666666666, "grad_norm": 0.015667492523789406, "learning_rate": 4.777777777777778e-06, "loss": -0.0432, "num_tokens": 7559032.0, "reward": 1.0260478258132935, "reward_std": 0.41725146770477295, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6260480880737305, "rewards/format_reward_step": 0.953125, "rewards/stepwise_brier_reward": 0.7218932509422302, "step": 28 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4419677419354838, "calib/avg_num_step_conf": 6.98828125, "calib/ece": 0.4039759036144578, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6506024096385542, "calib/gap": -0.004878709677419213, "calib/mean_conf": 0.8973895582329317, "calib/mu_c": 0.89496, "calib/mu_w": 0.8998387096774192, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.39967871485943773, "calib/std_conf": 0.11133731108105972, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2801.0, "completions/max_terminated_length": 2801.0, "completions/mean_length": 563.68359375, "completions/mean_terminated_length": 565.8941650390625, "completions/min_length": 0.0, "completions/min_terminated_length": 205.0, "epoch": 0.030933333333333334, "grad_norm": 0.011524133384227753, "learning_rate": 4.75e-06, "loss": -0.0063, "num_tokens": 7810463.0, "reward": 0.9101543426513672, "reward_std": 0.4151514172554016, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.5622597932815552, "rewards/format_reward_step": 0.96484375, "rewards/stepwise_brier_reward": 0.7236698865890503, "step": 29 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4676919015246518, "calib/avg_num_step_conf": 6.26953125, "calib/ece": 0.3269354838709677, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.5564516129032258, "calib/gap": 0.0006032605108574618, "calib/mean_conf": 0.8762096774193548, "calib/mu_c": 0.8764748201438849, "calib/mu_w": 0.8758715596330274, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.3213306451612903, "calib/std_conf": 0.11218570967255433, "calib/step_conf_rate": 0.95703125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2550.0, "completions/max_terminated_length": 2550.0, "completions/mean_length": 579.28125, "completions/mean_terminated_length": 583.842529296875, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.032, "grad_norm": 0.010599917732179165, "learning_rate": 4.722222222222222e-06, "loss": 0.0324, "num_tokens": 8065743.0, "reward": 0.962745189666748, "reward_std": 0.4372634291648865, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5958484411239624, "rewards/format_reward_step": 0.921875, "rewards/stepwise_brier_reward": 0.6988822221755981, "step": 30 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.43452877397831524, "calib/avg_num_step_conf": 6.76171875, "calib/ece": 0.4191666666666667, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.40476190476190477, "calib/gap": -0.02821517931609674, "calib/mean_conf": 0.8223412698412699, "calib/mu_c": 0.806330275229358, "calib/mu_w": 0.8345454545454547, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40448412698412706, "calib/std_conf": 0.14636493110955562, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2523.0, "completions/max_terminated_length": 2523.0, "completions/mean_length": 559.8828125, "completions/mean_terminated_length": 571.035888671875, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.03306666666666667, "grad_norm": 0.014690193347632885, "learning_rate": 4.694444444444445e-06, "loss": -0.0065, "num_tokens": 8314985.0, "reward": 0.853190541267395, "reward_std": 0.4030712842941284, "rewards/accuracy_reward_step": 0.42578125, "rewards/final_brier_reward_step": 0.5545613169670105, "rewards/format_reward_step": 0.98046875, "rewards/stepwise_brier_reward": 0.7628883719444275, "step": 31 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.3865949119373777, "calib/avg_num_step_conf": 5.71875, "calib/ece": 0.27011952191235056, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.199203187250996, "calib/gap": -0.057060665362035135, "calib/mean_conf": 0.7719521912350598, "calib/mu_c": 0.7480821917808219, "calib/mu_w": 0.805142857142857, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.230199203187251, "calib/std_conf": 0.14995010360415126, "calib/step_conf_rate": 0.9765625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2162.0, "completions/max_terminated_length": 2162.0, "completions/mean_length": 578.20703125, "completions/mean_terminated_length": 587.3849487304688, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.034133333333333335, "grad_norm": 0.042050715535879135, "learning_rate": 4.666666666666667e-06, "loss": 0.0025, "num_tokens": 8569710.0, "reward": 1.030322790145874, "reward_std": 0.3229885697364807, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6446847915649414, "rewards/format_reward_step": 0.9609375, "rewards/stepwise_brier_reward": 0.7953563928604126, "step": 32 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.41624880990161855, "calib/avg_num_step_conf": 5.3359375, "calib/ece": 0.26031746031746034, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.15079365079365079, "calib/gap": -0.039897175499841286, "calib/mean_conf": 0.7503968253968254, "calib/mu_c": 0.7321897810218978, "calib/mu_w": 0.7720869565217391, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.2335317460317461, "calib/std_conf": 0.15904676757367997, "calib/step_conf_rate": 0.984375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2609.0, "completions/max_terminated_length": 2609.0, "completions/mean_length": 587.67578125, "completions/mean_terminated_length": 594.644287109375, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.0352, "grad_norm": 0.011683015152812004, "learning_rate": 4.638888888888889e-06, "loss": 0.0067, "num_tokens": 8827027.0, "reward": 0.9961332082748413, "reward_std": 0.33360162377357483, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6480039358139038, "rewards/format_reward_step": 0.96484375, "rewards/stepwise_brier_reward": 0.7943413257598877, "step": 33 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4333769904184187, "calib/avg_num_step_conf": 5.57421875, "calib/ece": 0.18658730158730166, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.14285714285714285, "calib/gap": -0.031401392431240294, "calib/mean_conf": 0.7491269841269841, "calib/mu_c": 0.7380368098159508, "calib/mu_w": 0.7694382022471911, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.14444444444444451, "calib/std_conf": 0.15386563836244363, "calib/step_conf_rate": 0.98828125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2543.0, "completions/max_terminated_length": 2543.0, "completions/mean_length": 556.9375, "completions/mean_terminated_length": 563.54150390625, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.03626666666666667, "grad_norm": 0.0294670257717371, "learning_rate": 4.611111111111112e-06, "loss": 0.0084, "num_tokens": 9074715.0, "reward": 1.1192830801010132, "reward_std": 0.42812860012054443, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7105574607849121, "rewards/format_reward_step": 0.98046875, "rewards/stepwise_brier_reward": 0.8275126814842224, "step": 34 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.47167257492748954, "calib/avg_num_step_conf": 5.38671875, "calib/ece": 0.20876984126984127, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.1984126984126984, "calib/gap": -0.011532710280373837, "calib/mean_conf": 0.7728968253968254, "calib/mu_c": 0.768, "calib/mu_w": 0.7795327102803739, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.2031349206349206, "calib/std_conf": 0.1402970957264867, "calib/step_conf_rate": 0.984375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2786.0, "completions/max_terminated_length": 2786.0, "completions/mean_length": 673.2109375, "completions/mean_terminated_length": 681.1937255859375, "completions/min_length": 0.0, "completions/min_terminated_length": 232.0, "epoch": 0.037333333333333336, "grad_norm": 0.014074708335101604, "learning_rate": 4.583333333333333e-06, "loss": 0.0076, "num_tokens": 9356313.0, "reward": 1.0285307168960571, "reward_std": 0.3848259449005127, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6772078275680542, "rewards/format_reward_step": 0.97265625, "rewards/stepwise_brier_reward": 0.7822273373603821, "step": 35 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4557664345297314, "calib/avg_num_step_conf": 5.2734375, "calib/ece": 0.15769841269841264, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.1626984126984127, "calib/gap": -0.01092217035279719, "calib/mean_conf": 0.7893650793650794, "calib/mu_c": 0.7862011173184357, "calib/mu_w": 0.7971232876712329, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.11837301587301582, "calib/std_conf": 0.11884861674072683, "calib/step_conf_rate": 0.9921875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2284.0, "completions/max_terminated_length": 2284.0, "completions/mean_length": 541.38671875, "completions/mean_terminated_length": 549.9801635742188, "completions/min_length": 0.0, "completions/min_terminated_length": 209.0, "epoch": 0.0384, "grad_norm": 0.014961477369070053, "learning_rate": 4.555555555555556e-06, "loss": -0.0177, "num_tokens": 9597620.0, "reward": 1.1934587955474854, "reward_std": 0.3589988350868225, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.7558140754699707, "rewards/format_reward_step": 0.98046875, "rewards/stepwise_brier_reward": 0.8289585709571838, "step": 36 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.4918490917559385, "calib/avg_num_step_conf": 5.47265625, "calib/ece": 0.2806504065040649, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.22764227642276422, "calib/gap": 0.001924279725863176, "calib/mean_conf": 0.8102439024390244, "calib/mu_c": 0.8111278195488721, "calib/mu_w": 0.809203539823009, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2751219512195121, "calib/std_conf": 0.11876713168427873, "calib/step_conf_rate": 0.9921875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2565.0, "completions/max_terminated_length": 2565.0, "completions/mean_length": 586.6015625, "completions/mean_terminated_length": 600.6800537109375, "completions/min_length": 0.0, "completions/min_terminated_length": 232.0, "epoch": 0.039466666666666664, "grad_norm": 0.012317251414060593, "learning_rate": 4.527777777777778e-06, "loss": -0.0407, "num_tokens": 9854886.0, "reward": 0.9720534086227417, "reward_std": 0.3097987771034241, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6391417980194092, "rewards/format_reward_step": 0.95703125, "rewards/stepwise_brier_reward": 0.7881342768669128, "step": 37 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.4722105263157894, "calib/avg_num_step_conf": 5.3984375, "calib/ece": 0.24187755102040814, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.35918367346938773, "calib/gap": -0.004785964912280449, "calib/mean_conf": 0.8541224489795918, "calib/mu_c": 0.8522666666666666, "calib/mu_w": 0.8570526315789471, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.24187755102040814, "calib/std_conf": 0.08924482095601899, "calib/step_conf_rate": 0.9921875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2570.0, "completions/max_terminated_length": 2570.0, "completions/mean_length": 633.51953125, "completions/mean_terminated_length": 643.575439453125, "completions/min_length": 0.0, "completions/min_terminated_length": 243.0, "epoch": 0.04053333333333333, "grad_norm": 0.01297785434871912, "learning_rate": 4.5e-06, "loss": -0.0185, "num_tokens": 10123955.0, "reward": 1.0387858152389526, "reward_std": 0.3217565417289734, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6562929749488831, "rewards/format_reward_step": 0.94921875, "rewards/stepwise_brier_reward": 0.7754130363464355, "step": 38 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.49697709923664124, "calib/avg_num_step_conf": 5.390625, "calib/ece": 0.3632421874999999, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.4375, "calib/gap": -0.0024995419847325673, "calib/mean_conf": 0.8749609374999999, "calib/mu_c": 0.8737404580152672, "calib/mu_w": 0.8762399999999998, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3632421874999999, "calib/std_conf": 0.07939713297167027, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1529.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 526.3046875, "completions/mean_terminated_length": 534.6587524414062, "completions/min_length": 0.0, "completions/min_terminated_length": 203.0, "epoch": 0.0416, "grad_norm": 0.012574602849781513, "learning_rate": 4.472222222222223e-06, "loss": -0.0072, "num_tokens": 10364777.0, "reward": 0.9638110995292664, "reward_std": 0.36680635809898376, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.6080472469329834, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.8018845319747925, "step": 39 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5070136581764488, "calib/avg_num_step_conf": 5.5078125, "calib/ece": 0.3931372549019606, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5019607843137255, "calib/gap": -0.0009966777408636496, "calib/mean_conf": 0.8990196078431373, "calib/mu_c": 0.8985271317829457, "calib/mu_w": 0.8995238095238094, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3931372549019606, "calib/std_conf": 0.050907640397356894, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1292.0, "completions/max_terminated_length": 1292.0, "completions/mean_length": 539.40234375, "completions/mean_terminated_length": 547.9642944335938, "completions/min_length": 0.0, "completions/min_terminated_length": 220.0, "epoch": 0.042666666666666665, "grad_norm": 0.013815483078360558, "learning_rate": 4.444444444444444e-06, "loss": -0.0276, "num_tokens": 10609624.0, "reward": 0.9423449635505676, "reward_std": 0.36893853545188904, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.590073823928833, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.7652434706687927, "step": 40 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.44705344906349925, "calib/avg_num_step_conf": 5.5234375, "calib/ece": 0.14118110236220452, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7440944881889764, "calib/gap": -0.011462768387391353, "calib/mean_conf": 0.9181102362204724, "calib/mu_c": 0.9156281407035175, "calib/mu_w": 0.9270909090909089, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.13791338582677146, "calib/std_conf": 0.05131702121979931, "calib/step_conf_rate": 0.9921875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2914.0, "completions/max_terminated_length": 2914.0, "completions/mean_length": 484.25, "completions/mean_terminated_length": 489.99212646484375, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.04373333333333333, "grad_norm": 0.022481009364128113, "learning_rate": 4.416666666666667e-06, "loss": -0.0098, "num_tokens": 10840840.0, "reward": 1.2885180711746216, "reward_std": 0.29390010237693787, "rewards/accuracy_reward_step": 0.78125, "rewards/final_brier_reward_step": 0.7987339496612549, "rewards/format_reward_step": 0.98828125, "rewards/stepwise_brier_reward": 0.8350256681442261, "step": 41 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5179058884700376, "calib/avg_num_step_conf": 5.78515625, "calib/ece": 0.31270588235294106, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.8627450980392157, "calib/gap": 0.004680878720915049, "calib/mean_conf": 0.928392156862745, "calib/mu_c": 0.9301910828025476, "calib/mu_w": 0.9255102040816325, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31270588235294106, "calib/std_conf": 0.03032120750827227, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2837.0, "completions/max_terminated_length": 2837.0, "completions/mean_length": 453.78125, "completions/mean_terminated_length": 459.1620788574219, "completions/min_length": 0.0, "completions/min_terminated_length": 231.0, "epoch": 0.0448, "grad_norm": 0.013097383081912994, "learning_rate": 4.388888888888889e-06, "loss": 0.0037, "num_tokens": 11061376.0, "reward": 1.0799758434295654, "reward_std": 0.28886380791664124, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6642890572547913, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.8040512800216675, "step": 42 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.484006734006734, "calib/avg_num_step_conf": 5.78515625, "calib/ece": 0.2792941176470587, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.8156862745098039, "calib/gap": 0.005181818181818065, "calib/mean_conf": 0.9263529411764705, "calib/mu_c": 0.928181818181818, "calib/mu_w": 0.9229999999999999, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2792941176470587, "calib/std_conf": 0.039442598415683164, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2568.0, "completions/max_terminated_length": 2568.0, "completions/mean_length": 523.44921875, "completions/mean_terminated_length": 529.6561279296875, "completions/min_length": 0.0, "completions/min_terminated_length": 209.0, "epoch": 0.04586666666666667, "grad_norm": 0.011895827017724514, "learning_rate": 4.361111111111112e-06, "loss": 0.0228, "num_tokens": 11300603.0, "reward": 1.115537166595459, "reward_std": 0.3748445510864258, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6917195320129395, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.7938666343688965, "step": 43 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.49388111888111885, "calib/avg_num_step_conf": 6.09375, "calib/ece": 0.3701176470588233, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.8470588235294118, "calib/gap": -3.558941058945386e-05, "calib/mean_conf": 0.9291764705882352, "calib/mu_c": 0.929160839160839, "calib/mu_w": 0.9291964285714285, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36925490196078403, "calib/std_conf": 0.030520135354435367, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2555.0, "completions/max_terminated_length": 2555.0, "completions/mean_length": 561.49609375, "completions/mean_terminated_length": 568.1541748046875, "completions/min_length": 0.0, "completions/min_terminated_length": 242.0, "epoch": 0.046933333333333334, "grad_norm": 0.019324084743857384, "learning_rate": 4.333333333333334e-06, "loss": -0.0064, "num_tokens": 11550666.0, "reward": 1.0062882900238037, "reward_std": 0.34908923506736755, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6146227121353149, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.7777180075645447, "step": 44 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5563143376169011, "calib/avg_num_step_conf": 6.4609375, "calib/ece": 0.2808203124999999, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.83984375, "calib/gap": 0.003484491690775804, "calib/mean_conf": 0.9269921874999999, "calib/mu_c": 0.9282035928143711, "calib/mu_w": 0.9247191011235953, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2777343749999999, "calib/std_conf": 0.03734258546170635, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1434.0, "completions/max_terminated_length": 1434.0, "completions/mean_length": 538.859375, "completions/mean_terminated_length": 547.4127197265625, "completions/min_length": 0.0, "completions/min_terminated_length": 211.0, "epoch": 0.048, "grad_norm": 0.01865292713046074, "learning_rate": 4.305555555555556e-06, "loss": -0.0077, "num_tokens": 11793662.0, "reward": 1.1259329319000244, "reward_std": 0.2780539393424988, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.6975820064544678, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.7983371019363403, "step": 45 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4712543554006968, "calib/avg_num_step_conf": 6.7109375, "calib/ece": 0.42136546184738954, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8112449799196787, "calib/gap": -0.0003387533875338633, "calib/mean_conf": 0.9273895582329316, "calib/mu_c": 0.9272222222222221, "calib/mu_w": 0.9275609756097559, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42136546184738954, "calib/std_conf": 0.03501022299647165, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2517.0, "completions/max_terminated_length": 2517.0, "completions/mean_length": 597.08984375, "completions/mean_terminated_length": 604.1699829101562, "completions/min_length": 0.0, "completions/min_terminated_length": 238.0, "epoch": 0.04906666666666667, "grad_norm": 0.01988985389471054, "learning_rate": 4.277777777777778e-06, "loss": 0.0286, "num_tokens": 12051285.0, "reward": 0.9231994152069092, "reward_std": 0.3016362190246582, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.5554765462875366, "rewards/format_reward_step": 0.97265625, "rewards/stepwise_brier_reward": 0.779508650302887, "step": 46 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.47534293552812074, "calib/avg_num_step_conf": 6.62890625, "calib/ece": 0.2683333333333331, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6468253968253969, "calib/gap": -0.0031604938271605487, "calib/mean_conf": 0.911190476190476, "calib/mu_c": 0.9100617283950616, "calib/mu_w": 0.9132222222222222, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2683333333333331, "calib/std_conf": 0.043087344712530326, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2556.0, "completions/max_terminated_length": 2556.0, "completions/mean_length": 625.5546875, "completions/mean_terminated_length": 630.4802856445312, "completions/min_length": 0.0, "completions/min_terminated_length": 247.0, "epoch": 0.050133333333333335, "grad_norm": 0.01256301999092102, "learning_rate": 4.25e-06, "loss": 0.0022, "num_tokens": 12317403.0, "reward": 1.107884168624878, "reward_std": 0.2531202435493469, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6836367249488831, "rewards/format_reward_step": 0.98046875, "rewards/stepwise_brier_reward": 0.8244622945785522, "step": 47 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4838095238095238, "calib/avg_num_step_conf": 6.21875, "calib/ece": 0.2968235294117645, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.43529411764705883, "calib/gap": -0.004838095238095419, "calib/mean_conf": 0.8850588235294118, "calib/mu_c": 0.8830666666666666, "calib/mu_w": 0.887904761904762, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2968235294117645, "calib/std_conf": 0.05806602454498716, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1706.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 535.32421875, "completions/mean_terminated_length": 543.8214721679688, "completions/min_length": 0.0, "completions/min_terminated_length": 237.0, "epoch": 0.0512, "grad_norm": 0.01294518169015646, "learning_rate": 4.222222222222223e-06, "loss": -0.0111, "num_tokens": 12558134.0, "reward": 1.0580230951309204, "reward_std": 0.3338013291358948, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6613723039627075, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.8285326957702637, "step": 48 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4939107827038861, "calib/avg_num_step_conf": 6.85546875, "calib/ece": 0.22717647058823542, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.3333333333333333, "calib/gap": -0.00014573070607537542, "calib/mean_conf": 0.8741568627450982, "calib/mu_c": 0.8741071428571429, "calib/mu_w": 0.8742528735632182, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22125490196078446, "calib/std_conf": 0.0672948946433501, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2073.0, "completions/max_terminated_length": 2073.0, "completions/mean_length": 585.078125, "completions/mean_terminated_length": 594.3651123046875, "completions/min_length": 0.0, "completions/min_terminated_length": 307.0, "epoch": 0.05226666666666667, "grad_norm": 0.028894979506731033, "learning_rate": 4.194444444444445e-06, "loss": 0.0237, "num_tokens": 12812450.0, "reward": 1.1491636037826538, "reward_std": 0.2942463457584381, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.7214332222938538, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.8517837524414062, "step": 49 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5399495374264087, "calib/avg_num_step_conf": 6.80859375, "calib/ece": 0.19843749999999982, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.3671875, "calib/gap": 0.008544995794785515, "calib/mean_conf": 0.8781249999999999, "calib/mu_c": 0.8808620689655171, "calib/mu_w": 0.8723170731707316, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19843749999999982, "calib/std_conf": 0.06150774849561636, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1471.0, "completions/max_terminated_length": 1471.0, "completions/mean_length": 589.7890625, "completions/mean_terminated_length": 599.1508178710938, "completions/min_length": 0.0, "completions/min_terminated_length": 236.0, "epoch": 0.05333333333333334, "grad_norm": 0.01499101985245943, "learning_rate": 4.166666666666667e-06, "loss": -0.0115, "num_tokens": 13068796.0, "reward": 1.1754646301269531, "reward_std": 0.2648048400878906, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.7428476214408875, "rewards/format_reward_step": 1.0, "rewards/stepwise_brier_reward": 0.8402606248855591, "step": 50 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5011116051578479, "calib/avg_num_step_conf": 7.44921875, "calib/ece": 0.20179282868525894, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.4860557768924303, "calib/gap": 0.010431302801244935, "calib/mean_conf": 0.8910358565737051, "calib/mu_c": 0.8942774566473988, "calib/mu_w": 0.8838461538461538, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20179282868525894, "calib/std_conf": 0.06562390358912099, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2561.0, "completions/max_terminated_length": 2561.0, "completions/mean_length": 707.41796875, "completions/mean_terminated_length": 715.8063354492188, "completions/min_length": 0.0, "completions/min_terminated_length": 230.0, "epoch": 0.0544, "grad_norm": 0.00909256562590599, "learning_rate": 4.138888888888889e-06, "loss": -0.0024, "num_tokens": 13359191.0, "reward": 1.162571668624878, "reward_std": 0.3010524809360504, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.730698823928833, "rewards/format_reward_step": 0.98046875, "rewards/stepwise_brier_reward": 0.82427579164505, "step": 51 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5894594089038533, "calib/avg_num_step_conf": 7.6953125, "calib/ece": 0.11099206349206339, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5476190476190477, "calib/gap": 0.021161616161615715, "calib/mean_conf": 0.896626984126984, "calib/mu_c": 0.9011616161616159, "calib/mu_w": 0.8800000000000002, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11095238095238086, "calib/std_conf": 0.06154211799840743, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2545.0, "completions/max_terminated_length": 2545.0, "completions/mean_length": 654.8203125, "completions/mean_terminated_length": 665.2142944335938, "completions/min_length": 0.0, "completions/min_terminated_length": 263.0, "epoch": 0.055466666666666664, "grad_norm": 0.06300359219312668, "learning_rate": 4.111111111111111e-06, "loss": -0.0044, "num_tokens": 13634777.0, "reward": 1.2927242517471313, "reward_std": 0.21050235629081726, "rewards/accuracy_reward_step": 0.7734375, "rewards/final_brier_reward_step": 0.8098152279853821, "rewards/format_reward_step": 0.984375, "rewards/stepwise_brier_reward": 0.8735817670822144, "step": 52 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5171658986175115, "calib/avg_num_step_conf": 7.78125, "calib/ece": 0.20203124999999975, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.8515625, "calib/gap": 3.0721966205904394e-05, "calib/mean_conf": 0.9285937499999997, "calib/mu_c": 0.9286021505376343, "calib/mu_w": 0.9285714285714284, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20203124999999975, "calib/std_conf": 0.034871065526271186, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1234.0, "completions/max_terminated_length": 1234.0, "completions/mean_length": 649.703125, "completions/mean_terminated_length": 660.0159301757812, "completions/min_length": 0.0, "completions/min_terminated_length": 291.0, "epoch": 0.05653333333333333, "grad_norm": 0.010611703619360924, "learning_rate": 4.083333333333334e-06, "loss": -0.0203, "num_tokens": 13906925.0, "reward": 1.228580355644226, "reward_std": 0.21263298392295837, "rewards/accuracy_reward_step": 0.7265625, "rewards/final_brier_reward_step": 0.7593101263046265, "rewards/format_reward_step": 1.0, "rewards/stepwise_brier_reward": 0.8487610816955566, "step": 53 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5520249892287807, "calib/avg_num_step_conf": 7.484375, "calib/ece": 0.11270588235294092, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9686274509803922, "calib/gap": 0.007880224041361461, "calib/mean_conf": 0.9401568627450978, "calib/mu_c": 0.941516587677725, "calib/mu_w": 0.9336363636363635, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11270588235294092, "calib/std_conf": 0.02447837985557275, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1907.0, "completions/max_terminated_length": 1907.0, "completions/mean_length": 602.51171875, "completions/mean_terminated_length": 614.5139770507812, "completions/min_length": 0.0, "completions/min_terminated_length": 245.0, "epoch": 0.0576, "grad_norm": 0.012896656058728695, "learning_rate": 4.055555555555556e-06, "loss": -0.0189, "num_tokens": 14167400.0, "reward": 1.345977783203125, "reward_std": 0.24776965379714966, "rewards/accuracy_reward_step": 0.82421875, "rewards/final_brier_reward_step": 0.8428671956062317, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.8457317352294922, "step": 54 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4736639753940792, "calib/avg_num_step_conf": 7.29296875, "calib/ece": 0.34886274509803916, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0015686274509804088, "calib/mean_conf": 0.9488627450980391, "calib/mu_c": 0.948235294117647, "calib/mu_w": 0.9498039215686274, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34886274509803916, "calib/std_conf": 0.01651739381218445, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1588.0, "completions/max_terminated_length": 1588.0, "completions/mean_length": 659.48828125, "completions/mean_terminated_length": 669.9563598632812, "completions/min_length": 0.0, "completions/min_terminated_length": 199.0, "epoch": 0.058666666666666666, "grad_norm": 0.010405359789729118, "learning_rate": 4.027777777777779e-06, "loss": -0.0046, "num_tokens": 14444053.0, "reward": 1.0639708042144775, "reward_std": 0.3170587718486786, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.634779691696167, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.8320409655570984, "step": 55 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5382490575848174, "calib/avg_num_step_conf": 7.15625, "calib/ece": 0.32874509803921553, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.996078431372549, "calib/gap": 0.00011438970492649858, "calib/mean_conf": 0.9402745098039215, "calib/mu_c": 0.9403184713375796, "calib/mu_w": 0.9402040816326531, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3266666666666665, "calib/std_conf": 0.03356473314409362, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2531.0, "completions/max_terminated_length": 2531.0, "completions/mean_length": 658.625, "completions/mean_terminated_length": 666.434814453125, "completions/min_length": 0.0, "completions/min_terminated_length": 323.0, "epoch": 0.05973333333333333, "grad_norm": 0.009862474165856838, "learning_rate": 4.000000000000001e-06, "loss": 0.0011, "num_tokens": 14719501.0, "reward": 1.0863800048828125, "reward_std": 0.26529163122177124, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6543871164321899, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.8395702242851257, "step": 56 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.4058923061610431, "calib/avg_num_step_conf": 7.73828125, "calib/ece": 0.15205645161290315, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9919354838709677, "calib/gap": -0.006986165024385316, "calib/mean_conf": 0.9464112903225805, "calib/mu_c": 0.9449746192893401, "calib/mu_w": 0.9519607843137254, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15205645161290315, "calib/std_conf": 0.01935513272626141, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2894.0, "completions/max_terminated_length": 2894.0, "completions/mean_length": 737.109375, "completions/mean_terminated_length": 745.849853515625, "completions/min_length": 0.0, "completions/min_terminated_length": 373.0, "epoch": 0.0608, "grad_norm": 0.01667666621506214, "learning_rate": 3.972222222222223e-06, "loss": -0.0181, "num_tokens": 15014993.0, "reward": 1.2658354043960571, "reward_std": 0.31671637296676636, "rewards/accuracy_reward_step": 0.76953125, "rewards/final_brier_reward_step": 0.7852960824966431, "rewards/format_reward_step": 0.96484375, "rewards/stepwise_brier_reward": 0.8139827251434326, "step": 57 }, { "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.5374067421939762, "calib/avg_num_step_conf": 8.95703125, "calib/ece": 0.33068548387096774, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.9919354838709677, "calib/gap": 0.001463111356728497, "calib/mean_conf": 0.9516532258064516, "calib/mu_c": 0.9522077922077922, "calib/mu_w": 0.9507446808510637, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33068548387096774, "calib/std_conf": 0.016731053007968816, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2538.0, "completions/max_terminated_length": 2538.0, "completions/mean_length": 836.4609375, "completions/mean_terminated_length": 853.12353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 297.0, "epoch": 0.06186666666666667, "grad_norm": 0.011689404956996441, "learning_rate": 3.944444444444445e-06, "loss": -0.0337, "num_tokens": 15335447.0, "reward": 1.0376129150390625, "reward_std": 0.40040409564971924, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6320773363113403, "rewards/format_reward_step": 0.93359375, "rewards/stepwise_brier_reward": 0.7386871576309204, "step": 58 }, { "calib/answer_extract_rate": 0.84765625, "calib/auroc": 0.49141959051907225, "calib/avg_num_step_conf": 13.25390625, "calib/ece": 0.3057017543859649, "calib/final_conf_rate": 0.890625, "calib/format_rate": 0.84765625, "calib/frac_conf_gt_0.9": 0.9912280701754386, "calib/gap": -0.006495624840710312, "calib/mean_conf": 0.950438596491228, "calib/mu_c": 0.9481879194630872, "calib/mu_w": 0.9546835443037975, "calib/nonempty_final_conf_rate": 0.890625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3013157894736842, "calib/std_conf": 0.06441219170417527, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2991.0, "completions/max_terminated_length": 2991.0, "completions/mean_length": 867.2109375, "completions/mean_terminated_length": 925.0250244140625, "completions/min_length": 0.0, "completions/min_terminated_length": 242.0, "epoch": 0.06293333333333333, "grad_norm": 0.019393039867281914, "learning_rate": 3.916666666666667e-06, "loss": -0.0338, "num_tokens": 15663701.0, "reward": 0.9843331575393677, "reward_std": 0.48649871349334717, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6006816625595093, "rewards/format_reward_step": 0.84765625, "rewards/stepwise_brier_reward": 0.6694632768630981, "step": 59 }, { "calib/answer_extract_rate": 0.7109375, "calib/auroc": 0.4964037698412698, "calib/avg_num_step_conf": 19.37109375, "calib/ece": 0.2843769633507852, "calib/final_conf_rate": 0.74609375, "calib/format_rate": 0.7109375, "calib/frac_conf_gt_0.9": 0.9947643979057592, "calib/gap": 0.0002284226190473193, "calib/mean_conf": 0.9545340314136125, "calib/mu_c": 0.9546093749999999, "calib/mu_w": 0.9543809523809526, "calib/nonempty_final_conf_rate": 0.74609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2843769633507852, "calib/std_conf": 0.01503739297260751, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2896.0, "completions/max_terminated_length": 2896.0, "completions/mean_length": 910.171875, "completions/mean_terminated_length": 1078.7222900390625, "completions/min_length": 0.0, "completions/min_terminated_length": 481.0, "epoch": 0.064, "grad_norm": 0.009957793168723583, "learning_rate": 3.88888888888889e-06, "loss": -0.1715, "num_tokens": 16005561.0, "reward": 0.8382906913757324, "reward_std": 0.5560765266418457, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.5179914236068726, "rewards/format_reward_step": 0.7109375, "rewards/stepwise_brier_reward": 0.550796389579773, "step": 60 }, { "calib/answer_extract_rate": 0.68359375, "calib/auroc": 0.5552725027018681, "calib/avg_num_step_conf": 22.0703125, "calib/ece": 0.2530112359550561, "calib/final_conf_rate": 0.6953125, "calib/format_rate": 0.68359375, "calib/frac_conf_gt_0.9": 0.9887640449438202, "calib/gap": -0.004014821676701974, "calib/mean_conf": 0.955370786516854, "calib/mu_c": 0.954220472440945, "calib/mu_w": 0.958235294117647, "calib/nonempty_final_conf_rate": 0.6953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24744943820224713, "calib/std_conf": 0.07310457063478315, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 2961.0, "completions/max_terminated_length": 2961.0, "completions/mean_length": 749.63671875, "completions/mean_terminated_length": 979.1173095703125, "completions/min_length": 0.0, "completions/min_terminated_length": 291.0, "epoch": 0.06506666666666666, "grad_norm": 0.01580783724784851, "learning_rate": 3.861111111111112e-06, "loss": -0.2515, "num_tokens": 16301532.0, "reward": 0.8235231637954712, "reward_std": 0.5275648832321167, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.5068694353103638, "rewards/format_reward_step": 0.68359375, "rewards/stepwise_brier_reward": 0.5294104814529419, "step": 61 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.4106713598901099, "calib/avg_num_step_conf": 11.28125, "calib/ece": 0.2554308943089431, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9796747967479674, "calib/gap": -0.02974553571428573, "calib/mean_conf": 0.9550243902439025, "calib/mu_c": 0.9472857142857143, "calib/mu_w": 0.97703125, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2353089430894309, "calib/std_conf": 0.1368847039275888, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2303.0, "completions/max_terminated_length": 2303.0, "completions/mean_length": 841.3515625, "completions/mean_terminated_length": 872.0081176757812, "completions/min_length": 0.0, "completions/min_terminated_length": 435.0, "epoch": 0.06613333333333334, "grad_norm": 0.01019511092454195, "learning_rate": 3.833333333333334e-06, "loss": -0.0594, "num_tokens": 16623998.0, "reward": 1.1744306087493896, "reward_std": 0.36562198400497437, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.7024725675582886, "rewards/format_reward_step": 0.9609375, "rewards/stepwise_brier_reward": 0.767124593257904, "step": 62 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5096755419758474, "calib/avg_num_step_conf": 9.265625, "calib/ece": 0.2958300395256918, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9960474308300395, "calib/gap": -0.0009200494689365346, "calib/mean_conf": 0.9808102766798419, "calib/mu_c": 0.980522988505747, "calib/mu_w": 0.9814430379746836, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.29444664031620565, "calib/std_conf": 0.02345923158510876, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2581.0, "completions/max_terminated_length": 2581.0, "completions/mean_length": 825.765625, "completions/mean_terminated_length": 835.557373046875, "completions/min_length": 0.0, "completions/min_terminated_length": 333.0, "epoch": 0.0672, "grad_norm": 0.0351506732404232, "learning_rate": 3.8055555555555556e-06, "loss": -0.0357, "num_tokens": 16944034.0, "reward": 1.1468162536621094, "reward_std": 0.3491433262825012, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.6900019645690918, "rewards/format_reward_step": 0.984375, "rewards/stepwise_brier_reward": 0.7847628593444824, "step": 63 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.46044666230366493, "calib/avg_num_step_conf": 7.6171875, "calib/ece": 0.2349411764705882, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0011788285340311733, "calib/mean_conf": 0.9839607843137255, "calib/mu_c": 0.9836649214659688, "calib/mu_w": 0.98484375, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2349411764705882, "calib/std_conf": 0.007745867422693867, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2534.0, "completions/max_terminated_length": 2534.0, "completions/mean_length": 665.09765625, "completions/mean_terminated_length": 672.9841918945312, "completions/min_length": 0.0, "completions/min_terminated_length": 254.0, "epoch": 0.06826666666666667, "grad_norm": 0.020604388788342476, "learning_rate": 3.777777777777778e-06, "loss": -0.0091, "num_tokens": 17218075.0, "reward": 1.2398756742477417, "reward_std": 0.27078700065612793, "rewards/accuracy_reward_step": 0.74609375, "rewards/final_brier_reward_step": 0.753355860710144, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.8233342170715332, "step": 64 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5260015659663317, "calib/avg_num_step_conf": 6.7578125, "calib/ece": 0.36392156862745095, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0008710687720210331, "calib/mean_conf": 0.9835294117647059, "calib/mu_c": 0.9838607594936708, "calib/mu_w": 0.9829896907216498, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36392156862745095, "calib/std_conf": 0.006393479954491851, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1733.0, "completions/max_terminated_length": 1733.0, "completions/mean_length": 628.9296875, "completions/mean_terminated_length": 638.9127197265625, "completions/min_length": 0.0, "completions/min_terminated_length": 295.0, "epoch": 0.06933333333333333, "grad_norm": 0.015547499991953373, "learning_rate": 3.7500000000000005e-06, "loss": -0.0101, "num_tokens": 17484105.0, "reward": 1.0706870555877686, "reward_std": 0.24188312888145447, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6297671794891357, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.7857930660247803, "step": 65 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5005907960199004, "calib/avg_num_step_conf": 6.62109375, "calib/ece": 0.45437007874015767, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 2.3631840795901304e-05, "calib/mean_conf": 0.9819291338582679, "calib/mu_c": 0.9819402985074628, "calib/mu_w": 0.9819166666666669, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.45437007874015767, "calib/std_conf": 0.0064462417816946865, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1910.0, "completions/max_terminated_length": 1910.0, "completions/mean_length": 719.33984375, "completions/mean_terminated_length": 730.7579956054688, "completions/min_length": 0.0, "completions/min_terminated_length": 261.0, "epoch": 0.0704, "grad_norm": 0.02049914188683033, "learning_rate": 3.7222222222222225e-06, "loss": -0.017, "num_tokens": 17774608.0, "reward": 0.9499747157096863, "reward_std": 0.293699711561203, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.5400254130363464, "rewards/format_reward_step": 0.9921875, "rewards/stepwise_brier_reward": 0.7692484855651855, "step": 66 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5048156210831849, "calib/avg_num_step_conf": 6.16796875, "calib/ece": 0.3442773437500002, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00038228115311023103, "calib/mean_conf": 0.9809960937500002, "calib/mu_c": 0.9811349693251534, "calib/mu_w": 0.9807526881720432, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3442773437500002, "calib/std_conf": 0.0066058083147493025, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1393.0, "completions/max_terminated_length": 1393.0, "completions/mean_length": 639.625, "completions/mean_terminated_length": 649.77783203125, "completions/min_length": 0.0, "completions/min_terminated_length": 319.0, "epoch": 0.07146666666666666, "grad_norm": 0.02097323350608349, "learning_rate": 3.694444444444445e-06, "loss": -0.0129, "num_tokens": 18043360.0, "reward": 1.105022668838501, "reward_std": 0.2549349069595337, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6502983570098877, "rewards/format_reward_step": 1.0, "rewards/stepwise_brier_reward": 0.8229173421859741, "step": 67 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5325089065497397, "calib/avg_num_step_conf": 5.96484375, "calib/ece": 0.3310671936758893, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.000809125787886944, "calib/mean_conf": 0.9792885375494071, "calib/mu_c": 0.9795731707317074, "calib/mu_w": 0.9787640449438204, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3310671936758893, "calib/std_conf": 0.006433587986728514, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2587.0, "completions/max_terminated_length": 2587.0, "completions/mean_length": 627.87109375, "completions/mean_terminated_length": 635.3162231445312, "completions/min_length": 0.0, "completions/min_terminated_length": 270.0, "epoch": 0.07253333333333334, "grad_norm": 0.014935938641428947, "learning_rate": 3.6666666666666666e-06, "loss": 0.0051, "num_tokens": 18308183.0, "reward": 1.106945276260376, "reward_std": 0.21108710765838623, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6549258232116699, "rewards/format_reward_step": 0.98828125, "rewards/stepwise_brier_reward": 0.8150425553321838, "step": 68 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.47805589472256144, "calib/avg_num_step_conf": 5.921875, "calib/ece": 0.3346332015810277, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9960474308300395, "calib/gap": 0.007863492063492128, "calib/mean_conf": 0.9749494071146246, "calib/mu_c": 0.977777777777778, "calib/mu_w": 0.9699142857142858, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3346332015810277, "calib/std_conf": 0.04982479946714263, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1578.0, "completions/max_terminated_length": 1578.0, "completions/mean_length": 664.33203125, "completions/mean_terminated_length": 680.2760620117188, "completions/min_length": 0.0, "completions/min_terminated_length": 328.0, "epoch": 0.0736, "grad_norm": 0.017506230622529984, "learning_rate": 3.638888888888889e-06, "loss": -0.0487, "num_tokens": 18582748.0, "reward": 1.0986493825912476, "reward_std": 0.34393033385276794, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6511279940605164, "rewards/format_reward_step": 0.98828125, "rewards/stepwise_brier_reward": 0.8169069886207581, "step": 69 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.47662927350427353, "calib/avg_num_step_conf": 5.96875, "calib/ece": 0.3533531746031747, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.996031746031746, "calib/gap": 0.009599358974358707, "calib/mean_conf": 0.9724007936507937, "calib/mu_c": 0.9760576923076922, "calib/mu_w": 0.9664583333333335, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3533531746031747, "calib/std_conf": 0.06115854104070225, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2534.0, "completions/max_terminated_length": 2534.0, "completions/mean_length": 699.69921875, "completions/mean_terminated_length": 713.637451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 278.0, "epoch": 0.07466666666666667, "grad_norm": 0.023978127166628838, "learning_rate": 3.6111111111111115e-06, "loss": -0.0237, "num_tokens": 18868863.0, "reward": 1.0643212795257568, "reward_std": 0.23336264491081238, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6300995349884033, "rewards/format_reward_step": 0.984375, "rewards/stepwise_brier_reward": 0.7959352731704712, "step": 70 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4706828133552272, "calib/avg_num_step_conf": 5.765625, "calib/ece": 0.3212352941176471, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9803921568627451, "calib/gap": -0.0060621921182266725, "calib/mean_conf": 0.9687647058823531, "calib/mu_c": 0.9666964285714286, "calib/mu_w": 0.9727586206896552, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3155882352941177, "calib/std_conf": 0.04484552494136328, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1511.0, "completions/max_terminated_length": 1511.0, "completions/mean_length": 654.11328125, "completions/mean_terminated_length": 664.49609375, "completions/min_length": 0.0, "completions/min_terminated_length": 295.0, "epoch": 0.07573333333333333, "grad_norm": 0.023347020149230957, "learning_rate": 3.5833333333333335e-06, "loss": -0.0049, "num_tokens": 19140724.0, "reward": 1.1275029182434082, "reward_std": 0.3487013280391693, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6717905402183533, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.8147836923599243, "step": 71 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5244332961724265, "calib/avg_num_step_conf": 5.33203125, "calib/ece": 0.4258431372549019, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9921568627450981, "calib/gap": 0.011233742103306854, "calib/mean_conf": 0.9670196078431372, "calib/mu_c": 0.972173913043478, "calib/mu_w": 0.9609401709401711, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4258431372549019, "calib/std_conf": 0.0621993916482697, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1185.0, "completions/max_terminated_length": 1185.0, "completions/mean_length": 589.0546875, "completions/mean_terminated_length": 598.40478515625, "completions/min_length": 0.0, "completions/min_terminated_length": 314.0, "epoch": 0.0768, "grad_norm": 0.020961640402674675, "learning_rate": 3.555555555555556e-06, "loss": -0.0128, "num_tokens": 19395930.0, "reward": 0.9852393865585327, "reward_std": 0.20300054550170898, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5698285102844238, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.816441535949707, "step": 72 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4432429834678969, "calib/avg_num_step_conf": 4.94140625, "calib/ece": 0.19309647058823548, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9686274509803922, "calib/gap": -0.026223529411764424, "calib/mean_conf": 0.9517662745098039, "calib/mu_c": 0.9465215686274511, "calib/mu_w": 0.9727450980392155, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1724313725490198, "calib/std_conf": 0.1100040995664018, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2065.0, "completions/max_terminated_length": 2065.0, "completions/mean_length": 609.86328125, "completions/mean_terminated_length": 617.0949096679688, "completions/min_length": 0.0, "completions/min_terminated_length": 274.0, "epoch": 0.07786666666666667, "grad_norm": 0.018611526116728783, "learning_rate": 3.5277777777777784e-06, "loss": -0.0127, "num_tokens": 19659087.0, "reward": 1.3118444681167603, "reward_std": 0.2689002752304077, "rewards/accuracy_reward_step": 0.796875, "rewards/final_brier_reward_step": 0.7933633327484131, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.8680766224861145, "step": 73 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.49990322580645163, "calib/avg_num_step_conf": 4.55859375, "calib/ece": 0.3624313725490196, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00028709677419325175, "calib/mean_conf": 0.9702745098039215, "calib/mu_c": 0.9703870967741933, "calib/mu_w": 0.9701000000000001, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3624313725490196, "calib/std_conf": 0.005416309543723785, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1635.0, "completions/max_terminated_length": 1635.0, "completions/mean_length": 576.46875, "completions/mean_terminated_length": 587.9522094726562, "completions/min_length": 0.0, "completions/min_terminated_length": 248.0, "epoch": 0.07893333333333333, "grad_norm": 0.017507946118712425, "learning_rate": 3.5e-06, "loss": -0.0146, "num_tokens": 19910591.0, "reward": 1.0646164417266846, "reward_std": 0.29628145694732666, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6276878714561462, "rewards/format_reward_step": 0.9921875, "rewards/stepwise_brier_reward": 0.8120279312133789, "step": 74 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5252811599591041, "calib/avg_num_step_conf": 4.33203125, "calib/ece": 0.17699218750000023, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0006645599033368033, "calib/mean_conf": 0.9699609375000002, "calib/mu_c": 0.9700985221674877, "calib/mu_w": 0.9694339622641509, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17699218750000023, "calib/std_conf": 0.004192445482185043, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1097.0, "completions/max_terminated_length": 1097.0, "completions/mean_length": 558.8203125, "completions/mean_terminated_length": 567.6904907226562, "completions/min_length": 0.0, "completions/min_terminated_length": 297.0, "epoch": 0.08, "grad_norm": 0.017421932891011238, "learning_rate": 3.4722222222222224e-06, "loss": 0.0026, "num_tokens": 20158401.0, "reward": 1.3097432851791382, "reward_std": 0.18140952289104462, "rewards/accuracy_reward_step": 0.79296875, "rewards/final_brier_reward_step": 0.8008023500442505, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.8678584098815918, "step": 75 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.492156862745098, "calib/avg_num_step_conf": 3.98828125, "calib/ece": 0.3009570312500002, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -2.30478156173497e-05, "calib/mean_conf": 0.9689257812500002, "calib/mu_c": 0.9689181286549707, "calib/mu_w": 0.9689411764705881, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3009570312500002, "calib/std_conf": 0.005738299863822777, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1765.0, "completions/max_terminated_length": 1765.0, "completions/mean_length": 557.28515625, "completions/mean_terminated_length": 566.1309814453125, "completions/min_length": 0.0, "completions/min_terminated_length": 265.0, "epoch": 0.08106666666666666, "grad_norm": 0.021617865189909935, "learning_rate": 3.444444444444445e-06, "loss": -0.0241, "num_tokens": 20404122.0, "reward": 1.1463499069213867, "reward_std": 0.2512900233268738, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.6873643398284912, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.8277227878570557, "step": 76 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4576424623594435, "calib/avg_num_step_conf": 3.875, "calib/ece": 0.1919123505976097, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9840637450199203, "calib/gap": -0.015322088812654733, "calib/mean_conf": 0.956215139442231, "calib/mu_c": 0.9529797979797979, "calib/mu_w": 0.9683018867924527, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17964143426294835, "calib/std_conf": 0.09267396438687492, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2347.0, "completions/max_terminated_length": 2347.0, "completions/mean_length": 553.8359375, "completions/mean_terminated_length": 562.6270141601562, "completions/min_length": 0.0, "completions/min_terminated_length": 231.0, "epoch": 0.08213333333333334, "grad_norm": 0.019355909898877144, "learning_rate": 3.416666666666667e-06, "loss": -0.0108, "num_tokens": 20650568.0, "reward": 1.2847660779953003, "reward_std": 0.1536039412021637, "rewards/accuracy_reward_step": 0.78125, "rewards/final_brier_reward_step": 0.7762621641159058, "rewards/format_reward_step": 0.98046875, "rewards/stepwise_brier_reward": 0.8456146717071533, "step": 77 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5263785790031814, "calib/avg_num_step_conf": 3.73828125, "calib/ece": 0.3275000000000001, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.99609375, "calib/gap": 0.0014846235418876086, "calib/mean_conf": 0.9681250000000001, "calib/mu_c": 0.9686585365853657, "calib/mu_w": 0.9671739130434781, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3275000000000001, "calib/std_conf": 0.00778118724874296, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1085.0, "completions/max_terminated_length": 1085.0, "completions/mean_length": 613.16015625, "completions/mean_terminated_length": 622.8928833007812, "completions/min_length": 0.0, "completions/min_terminated_length": 337.0, "epoch": 0.0832, "grad_norm": 0.028165999799966812, "learning_rate": 3.3888888888888893e-06, "loss": 0.0269, "num_tokens": 20915561.0, "reward": 1.1081875562667847, "reward_std": 0.27526000142097473, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.663142204284668, "rewards/format_reward_step": 1.0, "rewards/stepwise_brier_reward": 0.8071078658103943, "step": 78 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4855905629386292, "calib/avg_num_step_conf": 3.7421875, "calib/ece": 0.2625098039215687, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.996078431372549, "calib/gap": -0.005039569956696743, "calib/mean_conf": 0.9658823529411765, "calib/mu_c": 0.9644198895027625, "calib/mu_w": 0.9694594594594592, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2592941176470589, "calib/std_conf": 0.04956186352461229, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1030.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 580.56640625, "completions/mean_terminated_length": 589.7817993164062, "completions/min_length": 0.0, "completions/min_terminated_length": 306.0, "epoch": 0.08426666666666667, "grad_norm": 0.028801538050174713, "learning_rate": 3.3611111111111117e-06, "loss": -0.0137, "num_tokens": 21170562.0, "reward": 1.1963386535644531, "reward_std": 0.1521933674812317, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.7207003831863403, "rewards/format_reward_step": 0.9921875, "rewards/stepwise_brier_reward": 0.8396539688110352, "step": 79 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.4905820105820106, "calib/avg_num_step_conf": 3.80859375, "calib/ece": 0.2864843750000001, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0002469135802467548, "calib/mean_conf": 0.9700781250000001, "calib/mu_c": 0.97, "calib/mu_w": 0.9702469135802467, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2864843750000001, "calib/std_conf": 0.001766039774290207, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1059.0, "completions/max_terminated_length": 1059.0, "completions/mean_length": 520.53515625, "completions/mean_terminated_length": 528.7976684570312, "completions/min_length": 0.0, "completions/min_terminated_length": 304.0, "epoch": 0.08533333333333333, "grad_norm": 0.025867415592074394, "learning_rate": 3.3333333333333333e-06, "loss": 0.0109, "num_tokens": 21405979.0, "reward": 1.1613820791244507, "reward_std": 0.26657500863075256, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7015234231948853, "rewards/format_reward_step": 1.0, "rewards/stepwise_brier_reward": 0.8096296787261963, "step": 80 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5023552123552123, "calib/avg_num_step_conf": 3.66796875, "calib/ece": 0.24501960784313737, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00011196911196931225, "calib/mean_conf": 0.9705098039215687, "calib/mu_c": 0.9705405405405406, "calib/mu_w": 0.9704285714285713, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24501960784313737, "calib/std_conf": 0.0029596803551550158, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2253.0, "completions/max_terminated_length": 2253.0, "completions/mean_length": 591.33984375, "completions/mean_terminated_length": 600.7261962890625, "completions/min_length": 0.0, "completions/min_terminated_length": 306.0, "epoch": 0.0864, "grad_norm": 0.02036810852587223, "learning_rate": 3.3055555555555558e-06, "loss": -0.0144, "num_tokens": 21663610.0, "reward": 1.2185229063034058, "reward_std": 0.27913594245910645, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7379531860351562, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.8470759391784668, "step": 81 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.48837209302325585, "calib/avg_num_step_conf": 3.70703125, "calib/ece": 0.30597656250000016, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00011627906976729996, "calib/mean_conf": 0.9700390625000002, "calib/mu_c": 0.9699999999999999, "calib/mu_w": 0.9701162790697672, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30597656250000016, "calib/std_conf": 0.0024202993866655753, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 538.63671875, "completions/mean_terminated_length": 547.1865234375, "completions/min_length": 0.0, "completions/min_terminated_length": 287.0, "epoch": 0.08746666666666666, "grad_norm": 0.013427666388452053, "learning_rate": 3.277777777777778e-06, "loss": 0.0162, "num_tokens": 21907053.0, "reward": 1.1466963291168213, "reward_std": 0.2291615903377533, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.6832371354103088, "rewards/format_reward_step": 1.0, "rewards/stepwise_brier_reward": 0.8472981452941895, "step": 82 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4529143536875495, "calib/avg_num_step_conf": 3.33203125, "calib/ece": 0.35395256916996054, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0009417129262494495, "calib/mean_conf": 0.9705533596837945, "calib/mu_c": 0.9701923076923075, "calib/mu_w": 0.9711340206185569, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35395256916996054, "calib/std_conf": 0.002286348595095586, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2092.0, "completions/max_terminated_length": 2092.0, "completions/mean_length": 614.5546875, "completions/mean_terminated_length": 621.8419189453125, "completions/min_length": 0.0, "completions/min_terminated_length": 306.0, "epoch": 0.08853333333333334, "grad_norm": 0.020100336521863937, "learning_rate": 3.2500000000000002e-06, "loss": 0.0093, "num_tokens": 22171643.0, "reward": 1.0668973922729492, "reward_std": 0.22004249691963196, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6303879022598267, "rewards/format_reward_step": 0.98828125, "rewards/stepwise_brier_reward": 0.8043888807296753, "step": 83 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4567015786278081, "calib/avg_num_step_conf": 3.4609375, "calib/ece": 0.24996078431372543, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.996078431372549, "calib/gap": 0.00885928961748661, "calib/mean_conf": 0.9676078431372549, "calib/mu_c": 0.9701092896174863, "calib/mu_w": 0.9612499999999997, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24996078431372543, "calib/std_conf": 0.04508122448681152, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1065.0, "completions/max_terminated_length": 1065.0, "completions/mean_length": 544.75, "completions/mean_terminated_length": 553.3968505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 221.0, "epoch": 0.0896, "grad_norm": 0.02724604308605194, "learning_rate": 3.2222222222222227e-06, "loss": 0.0042, "num_tokens": 22417019.0, "reward": 1.2128814458847046, "reward_std": 0.26271331310272217, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.7334933280944824, "rewards/format_reward_step": 0.9921875, "rewards/stepwise_brier_reward": 0.8617823719978333, "step": 84 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.519968292314661, "calib/avg_num_step_conf": 3.4921875, "calib/ece": 0.26225296442687757, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9881422924901185, "calib/gap": 0.0035097387890684617, "calib/mean_conf": 0.9680237154150197, "calib/mu_c": 0.9690502793296089, "calib/mu_w": 0.9655405405405404, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2613833992094862, "calib/std_conf": 0.017582314338563027, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2559.0, "completions/max_terminated_length": 2559.0, "completions/mean_length": 603.33203125, "completions/mean_terminated_length": 612.9087524414062, "completions/min_length": 0.0, "completions/min_terminated_length": 320.0, "epoch": 0.09066666666666667, "grad_norm": 0.018631750717759132, "learning_rate": 3.1944444444444443e-06, "loss": -0.0191, "num_tokens": 22679296.0, "reward": 1.1869070529937744, "reward_std": 0.28921443223953247, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.7178245782852173, "rewards/format_reward_step": 0.98828125, "rewards/stepwise_brier_reward": 0.8376162052154541, "step": 85 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4818158032443746, "calib/avg_num_step_conf": 3.515625, "calib/ece": 0.35633858267716534, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0006619570905286309, "calib/mean_conf": 0.970511811023622, "calib/mu_c": 0.9702564102564102, "calib/mu_w": 0.9709183673469388, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35633858267716534, "calib/std_conf": 0.002686701404460288, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1649.0, "completions/max_terminated_length": 1649.0, "completions/mean_length": 615.48046875, "completions/mean_terminated_length": 625.2500610351562, "completions/min_length": 0.0, "completions/min_terminated_length": 223.0, "epoch": 0.09173333333333333, "grad_norm": 0.01573004759848118, "learning_rate": 3.1666666666666667e-06, "loss": 0.0215, "num_tokens": 22942371.0, "reward": 1.0722620487213135, "reward_std": 0.2773081660270691, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6307706832885742, "rewards/format_reward_step": 0.9921875, "rewards/stepwise_brier_reward": 0.8082774877548218, "step": 86 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4833765112262522, "calib/avg_num_step_conf": 3.71875, "calib/ece": 0.21571146245059297, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9762845849802372, "calib/gap": -0.008424006908462567, "calib/mean_conf": 0.9644071146245059, "calib/mu_c": 0.9624093264248705, "calib/mu_w": 0.9708333333333331, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20863636363636373, "calib/std_conf": 0.04110677637575858, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2094.0, "completions/max_terminated_length": 2094.0, "completions/mean_length": 578.265625, "completions/mean_terminated_length": 592.14404296875, "completions/min_length": 0.0, "completions/min_terminated_length": 248.0, "epoch": 0.0928, "grad_norm": 0.020017167553305626, "learning_rate": 3.138888888888889e-06, "loss": -0.0012, "num_tokens": 23195903.0, "reward": 1.2616093158721924, "reward_std": 0.266476571559906, "rewards/accuracy_reward_step": 0.75390625, "rewards/final_brier_reward_step": 0.7646561861038208, "rewards/format_reward_step": 0.98828125, "rewards/stepwise_brier_reward": 0.8708432912826538, "step": 87 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5078077158603796, "calib/avg_num_step_conf": 3.671875, "calib/ece": 0.24780392156862732, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.996078431372549, "calib/gap": 0.002839099816289048, "calib/mean_conf": 0.9693725490196078, "calib/mu_c": 0.9701630434782608, "calib/mu_w": 0.9673239436619717, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24780392156862732, "calib/std_conf": 0.012159013092552039, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2807.0, "completions/max_terminated_length": 2807.0, "completions/mean_length": 652.81640625, "completions/mean_terminated_length": 663.1785888671875, "completions/min_length": 0.0, "completions/min_terminated_length": 377.0, "epoch": 0.09386666666666667, "grad_norm": 0.011485312134027481, "learning_rate": 3.1111111111111116e-06, "loss": -0.0263, "num_tokens": 23472872.0, "reward": 1.2150477170944214, "reward_std": 0.2322675883769989, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.7357933521270752, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.8509601354598999, "step": 88 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.48439523439523435, "calib/avg_num_step_conf": 3.73828125, "calib/ece": 0.3584313725490196, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0003399378399379094, "calib/mean_conf": 0.9701960784313726, "calib/mu_c": 0.9700641025641025, "calib/mu_w": 0.9704040404040404, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3584313725490196, "calib/std_conf": 0.0029968585371831866, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1570.0, "completions/max_terminated_length": 1570.0, "completions/mean_length": 669.63671875, "completions/mean_terminated_length": 680.2659301757812, "completions/min_length": 0.0, "completions/min_terminated_length": 283.0, "epoch": 0.09493333333333333, "grad_norm": 0.015711192041635513, "learning_rate": 3.0833333333333336e-06, "loss": -0.0146, "num_tokens": 23753187.0, "reward": 1.0668054819107056, "reward_std": 0.27132242918014526, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6274691820144653, "rewards/format_reward_step": 0.9921875, "rewards/stepwise_brier_reward": 0.8053778409957886, "step": 89 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.49716223698781836, "calib/avg_num_step_conf": 3.73828125, "calib/ece": 0.3086614173228346, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -5.675526024351285e-05, "calib/mean_conf": 0.9700787401574803, "calib/mu_c": 0.9700595238095236, "calib/mu_w": 0.9701162790697672, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3086614173228346, "calib/std_conf": 0.0008838560756158923, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1157.0, "completions/max_terminated_length": 1157.0, "completions/mean_length": 631.1640625, "completions/mean_terminated_length": 643.737060546875, "completions/min_length": 0.0, "completions/min_terminated_length": 282.0, "epoch": 0.096, "grad_norm": 0.014824142679572105, "learning_rate": 3.055555555555556e-06, "loss": -0.0078, "num_tokens": 24018085.0, "reward": 1.1338977813720703, "reward_std": 0.27669697999954224, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6752082109451294, "rewards/format_reward_step": 0.98828125, "rewards/stepwise_brier_reward": 0.8400702476501465, "step": 90 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.500965250965251, "calib/avg_num_step_conf": 3.80859375, "calib/ece": 0.2446666666666666, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.9305019305360105e-05, "calib/mean_conf": 0.9701568627450979, "calib/mu_c": 0.9701621621621622, "calib/mu_w": 0.9701428571428569, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2446666666666666, "calib/std_conf": 0.0012425866288435195, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1500.0, "completions/max_terminated_length": 1500.0, "completions/mean_length": 687.3125, "completions/mean_terminated_length": 695.4624633789062, "completions/min_length": 0.0, "completions/min_terminated_length": 400.0, "epoch": 0.09706666666666666, "grad_norm": 0.01124529168009758, "learning_rate": 3.0277777777777776e-06, "loss": 0.0103, "num_tokens": 24301749.0, "reward": 1.213853120803833, "reward_std": 0.19600006937980652, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7380957007408142, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.8282541036605835, "step": 91 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.500965250965251, "calib/avg_num_step_conf": 3.86328125, "calib/ece": 0.2446666666666666, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.9305019305360105e-05, "calib/mean_conf": 0.9701568627450979, "calib/mu_c": 0.9701621621621622, "calib/mu_w": 0.9701428571428569, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2446666666666666, "calib/std_conf": 0.0012425866288435195, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1262.0, "completions/max_terminated_length": 1262.0, "completions/mean_length": 608.96484375, "completions/mean_terminated_length": 618.6309814453125, "completions/min_length": 0.0, "completions/min_terminated_length": 214.0, "epoch": 0.09813333333333334, "grad_norm": 0.060974352061748505, "learning_rate": 3e-06, "loss": -0.0088, "num_tokens": 24564364.0, "reward": 1.2194641828536987, "reward_std": 0.21166513860225677, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7379410266876221, "rewards/format_reward_step": 0.9921875, "rewards/stepwise_brier_reward": 0.8524154424667358, "step": 92 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5027027027027027, "calib/avg_num_step_conf": 3.890625, "calib/ece": 0.24738281250000016, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 5.405405405423114e-05, "calib/mean_conf": 0.9700390625000002, "calib/mu_c": 0.9700540540540541, "calib/mu_w": 0.9699999999999999, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24738281250000016, "calib/std_conf": 0.0006237781024480985, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1558.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 621.94921875, "completions/mean_terminated_length": 631.8214721679688, "completions/min_length": 0.0, "completions/min_terminated_length": 308.0, "epoch": 0.0992, "grad_norm": 0.012551271356642246, "learning_rate": 2.9722222222222225e-06, "loss": 0.0043, "num_tokens": 24829359.0, "reward": 1.2152096033096313, "reward_std": 0.2993466854095459, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7383987903594971, "rewards/format_reward_step": 1.0, "rewards/stepwise_brier_reward": 0.8318144679069519, "step": 93 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5058139534883721, "calib/avg_num_step_conf": 3.875, "calib/ece": 0.2982031250000001, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00011627906976763303, "calib/mean_conf": 0.9700781250000001, "calib/mu_c": 0.9701162790697675, "calib/mu_w": 0.9699999999999999, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2982031250000001, "calib/std_conf": 0.0008804240366863013, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1422.0, "completions/max_terminated_length": 1422.0, "completions/mean_length": 615.2109375, "completions/mean_terminated_length": 624.9761962890625, "completions/min_length": 0.0, "completions/min_terminated_length": 297.0, "epoch": 0.10026666666666667, "grad_norm": 0.024974621832370758, "learning_rate": 2.944444444444445e-06, "loss": -0.0, "num_tokens": 25095533.0, "reward": 1.1495177745819092, "reward_std": 0.22028878331184387, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.6906664371490479, "rewards/format_reward_step": 1.0, "rewards/stepwise_brier_reward": 0.8199045658111572, "step": 94 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4941522491349481, "calib/avg_num_step_conf": 3.8984375, "calib/ece": 0.303372549019608, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00011764705882322257, "calib/mean_conf": 0.9700392156862746, "calib/mu_c": 0.9700000000000001, "calib/mu_w": 0.9701176470588233, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.303372549019608, "calib/std_conf": 0.0010839431342027658, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1379.0, "completions/max_terminated_length": 1379.0, "completions/mean_length": 622.78125, "completions/mean_terminated_length": 632.6666870117188, "completions/min_length": 0.0, "completions/min_terminated_length": 227.0, "epoch": 0.10133333333333333, "grad_norm": 0.1932401806116104, "learning_rate": 2.916666666666667e-06, "loss": -0.0271, "num_tokens": 25361093.0, "reward": 1.1338911056518555, "reward_std": 0.215902179479599, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.683010995388031, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.7978659868240356, "step": 95 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5025773195876289, "calib/avg_num_step_conf": 3.88671875, "calib/ece": 0.20925490196078445, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 5.1546391752865794e-05, "calib/mean_conf": 0.9700392156862746, "calib/mu_c": 0.9700515463917526, "calib/mu_w": 0.9699999999999998, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.20925490196078445, "calib/std_conf": 0.0006249951941376173, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1270.0, "completions/max_terminated_length": 1270.0, "completions/mean_length": 592.59375, "completions/mean_terminated_length": 602.0000610351562, "completions/min_length": 0.0, "completions/min_terminated_length": 206.0, "epoch": 0.1024, "grad_norm": 0.02530534751713276, "learning_rate": 2.888888888888889e-06, "loss": 0.0126, "num_tokens": 25618613.0, "reward": 1.2543809413909912, "reward_std": 0.127590149641037, "rewards/accuracy_reward_step": 0.7578125, "rewards/final_brier_reward_step": 0.7673121690750122, "rewards/format_reward_step": 0.9921875, "rewards/stepwise_brier_reward": 0.8220862150192261, "step": 96 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.520141535111595, "calib/avg_num_step_conf": 3.84765625, "calib/ece": 0.3150588235294116, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0006341861731082865, "calib/mean_conf": 0.9699607843137253, "calib/mu_c": 0.9701796407185628, "calib/mu_w": 0.9695454545454545, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3150588235294116, "calib/std_conf": 0.002076580752137057, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1141.0, "completions/max_terminated_length": 1141.0, "completions/mean_length": 563.3984375, "completions/mean_terminated_length": 572.34130859375, "completions/min_length": 0.0, "completions/min_terminated_length": 272.0, "epoch": 0.10346666666666667, "grad_norm": 0.029310384765267372, "learning_rate": 2.861111111111111e-06, "loss": 0.003, "num_tokens": 25867915.0, "reward": 1.1218373775482178, "reward_std": 0.29154282808303833, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.6684753894805908, "rewards/format_reward_step": 0.9921875, "rewards/stepwise_brier_reward": 0.8126240968704224, "step": 97 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4967741935483871, "calib/avg_num_step_conf": 3.7109375, "calib/ece": 0.36211764705882354, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -6.451612903235482e-05, "calib/mean_conf": 0.9699607843137255, "calib/mu_c": 0.9699354838709676, "calib/mu_w": 0.97, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36211764705882354, "calib/std_conf": 0.0006249951941376175, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1094.0, "completions/max_terminated_length": 1094.0, "completions/mean_length": 580.79296875, "completions/mean_terminated_length": 590.011962890625, "completions/min_length": 0.0, "completions/min_terminated_length": 252.0, "epoch": 0.10453333333333334, "grad_norm": 0.015152696520090103, "learning_rate": 2.8333333333333335e-06, "loss": 0.0142, "num_tokens": 26122782.0, "reward": 1.0578248500823975, "reward_std": 0.3458312153816223, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6280070543289185, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.7829799652099609, "step": 98 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5120300751879701, "calib/avg_num_step_conf": 3.671875, "calib/ece": 0.4442687747035574, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00024185463659109896, "calib/mean_conf": 0.9699604743083005, "calib/mu_c": 0.9700751879699246, "calib/mu_w": 0.9698333333333335, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4442687747035574, "calib/std_conf": 0.0010882134306668945, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1708.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 586.5234375, "completions/mean_terminated_length": 600.6000366210938, "completions/min_length": 0.0, "completions/min_terminated_length": 232.0, "epoch": 0.1056, "grad_norm": 0.014242448844015598, "learning_rate": 2.805555555555556e-06, "loss": -0.011, "num_tokens": 26378732.0, "reward": 0.9340698719024658, "reward_std": 0.32346072793006897, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.5469195246696472, "rewards/format_reward_step": 0.98828125, "rewards/stepwise_brier_reward": 0.7159223556518555, "step": 99 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5057447397563676, "calib/avg_num_step_conf": 3.64453125, "calib/ece": 0.2982031250000001, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.000116279069767522, "calib/mean_conf": 0.9700781250000001, "calib/mu_c": 0.9701162790697674, "calib/mu_w": 0.9699999999999999, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2982031250000001, "calib/std_conf": 0.0015289363899047615, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1406.0, "completions/max_terminated_length": 1406.0, "completions/mean_length": 604.5, "completions/mean_terminated_length": 614.0952758789062, "completions/min_length": 0.0, "completions/min_terminated_length": 303.0, "epoch": 0.10666666666666667, "grad_norm": 0.018832623958587646, "learning_rate": 2.7777777777777783e-06, "loss": -0.004, "num_tokens": 26640892.0, "reward": 1.1526455879211426, "reward_std": 0.2238457053899765, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.6906648874282837, "rewards/format_reward_step": 1.0, "rewards/stepwise_brier_reward": 0.8324178457260132, "step": 100 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.496875, "calib/avg_num_step_conf": 3.69140625, "calib/ece": 0.34492187500000016, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0001250000000002638, "calib/mean_conf": 0.9699218750000002, "calib/mu_c": 0.9698749999999998, "calib/mu_w": 0.9700000000000001, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34492187500000016, "calib/std_conf": 0.0015289363899047615, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1290.0, "completions/max_terminated_length": 1290.0, "completions/mean_length": 591.87109375, "completions/mean_terminated_length": 601.2659301757812, "completions/min_length": 0.0, "completions/min_terminated_length": 293.0, "epoch": 0.10773333333333333, "grad_norm": 0.0126921022310853, "learning_rate": 2.7500000000000004e-06, "loss": -0.0075, "num_tokens": 26899403.0, "reward": 1.0863710641860962, "reward_std": 0.32370680570602417, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6465929746627808, "rewards/format_reward_step": 1.0, "rewards/stepwise_brier_reward": 0.7988911271095276, "step": 101 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.4912405303030303, "calib/avg_num_step_conf": 3.75, "calib/ece": 0.31355468750000026, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00012445887445888815, "calib/mean_conf": 0.9698046875000003, "calib/mu_c": 0.9697619047619047, "calib/mu_w": 0.9698863636363636, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31355468750000026, "calib/std_conf": 0.0020636685846675477, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1083.0, "completions/max_terminated_length": 1083.0, "completions/mean_length": 550.76171875, "completions/mean_terminated_length": 561.7330932617188, "completions/min_length": 0.0, "completions/min_terminated_length": 211.0, "epoch": 0.1088, "grad_norm": 0.017177334055304527, "learning_rate": 2.7222222222222224e-06, "loss": -0.0021, "num_tokens": 27147094.0, "reward": 1.1249635219573975, "reward_std": 0.17377358675003052, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6760371327400208, "rewards/format_reward_step": 1.0, "rewards/stepwise_brier_reward": 0.7988170981407166, "step": 102 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4919365079365079, "calib/avg_num_step_conf": 3.640625, "calib/ece": 0.38172549019607827, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0002285714285717999, "calib/mean_conf": 0.9699607843137253, "calib/mu_c": 0.9698666666666664, "calib/mu_w": 0.9700952380952382, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38172549019607827, "calib/std_conf": 0.0016563695750543448, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1473.0, "completions/max_terminated_length": 1473.0, "completions/mean_length": 593.23046875, "completions/mean_terminated_length": 602.6468505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 230.0, "epoch": 0.10986666666666667, "grad_norm": 0.01597883179783821, "learning_rate": 2.6944444444444444e-06, "loss": -0.0116, "num_tokens": 27403513.0, "reward": 1.029662847518921, "reward_std": 0.30901777744293213, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6095671653747559, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.7668969631195068, "step": 103 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5157319159335289, "calib/avg_num_step_conf": 3.8046875, "calib/ece": 0.4542578125000002, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00039833822091872584, "calib/mean_conf": 0.9698828125000002, "calib/mu_c": 0.9700757575757575, "calib/mu_w": 0.9696774193548388, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4542578125000002, "calib/std_conf": 0.001871334307344296, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1182.0, "completions/max_terminated_length": 1182.0, "completions/mean_length": 591.1953125, "completions/mean_terminated_length": 600.5794067382812, "completions/min_length": 0.0, "completions/min_terminated_length": 253.0, "epoch": 0.11093333333333333, "grad_norm": 0.013716580346226692, "learning_rate": 2.666666666666667e-06, "loss": -0.03, "num_tokens": 27661539.0, "reward": 0.9389413595199585, "reward_std": 0.329437255859375, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5440894365310669, "rewards/format_reward_step": 1.0, "rewards/stepwise_brier_reward": 0.7491757869720459, "step": 104 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4879115741668337, "calib/avg_num_step_conf": 3.78125, "calib/ece": 0.33803149606299215, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9960629921259843, "calib/gap": -0.003138983503639836, "calib/mean_conf": 0.9677952755905511, "calib/mu_c": 0.9666459627329193, "calib/mu_w": 0.9697849462365591, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33598425196850396, "calib/std_conf": 0.030800689884562565, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1870.0, "completions/max_terminated_length": 1870.0, "completions/mean_length": 627.65625, "completions/mean_terminated_length": 640.1593627929688, "completions/min_length": 0.0, "completions/min_terminated_length": 312.0, "epoch": 0.112, "grad_norm": 0.02080809883773327, "learning_rate": 2.6388888888888893e-06, "loss": 0.0047, "num_tokens": 27927979.0, "reward": 1.0884490013122559, "reward_std": 0.2966870069503784, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6488890647888184, "rewards/format_reward_step": 0.9921875, "rewards/stepwise_brier_reward": 0.7924066781997681, "step": 105 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5146117817798007, "calib/avg_num_step_conf": 3.90234375, "calib/ece": 0.3326562500000002, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0005382940827232119, "calib/mean_conf": 0.9693750000000002, "calib/mu_c": 0.9695705521472392, "calib/mu_w": 0.969032258064516, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3326562500000002, "calib/std_conf": 0.0034798527267687666, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 575.84765625, "completions/mean_terminated_length": 584.9881591796875, "completions/min_length": 0.0, "completions/min_terminated_length": 334.0, "epoch": 0.11306666666666666, "grad_norm": 0.02210339345037937, "learning_rate": 2.6111111111111113e-06, "loss": -0.0033, "num_tokens": 28179980.0, "reward": 1.1033616065979004, "reward_std": 0.14004704356193542, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6582687497138977, "rewards/format_reward_step": 1.0, "rewards/stepwise_brier_reward": 0.8083028793334961, "step": 106 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5028895768833849, "calib/avg_num_step_conf": 3.8828125, "calib/ece": 0.30183593750000015, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 5.9855521155949276e-05, "calib/mean_conf": 0.9698046875000002, "calib/mu_c": 0.9698245614035086, "calib/mu_w": 0.9697647058823526, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30183593750000015, "calib/std_conf": 0.0034743672844625645, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1298.0, "completions/max_terminated_length": 1298.0, "completions/mean_length": 598.16796875, "completions/mean_terminated_length": 607.6627197265625, "completions/min_length": 0.0, "completions/min_terminated_length": 245.0, "epoch": 0.11413333333333334, "grad_norm": 0.020118990913033485, "learning_rate": 2.5833333333333337e-06, "loss": 0.0148, "num_tokens": 28437727.0, "reward": 1.1491875648498535, "reward_std": 0.36914438009262085, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.6871230602264404, "rewards/format_reward_step": 1.0, "rewards/stepwise_brier_reward": 0.8377525806427002, "step": 107 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5044014084507042, "calib/avg_num_step_conf": 3.8984375, "calib/ece": 0.24698039215686282, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.996078431372549, "calib/gap": 0.0022833741579916245, "calib/mean_conf": 0.9685490196078432, "calib/mu_c": 0.9691847826086956, "calib/mu_w": 0.9669014084507039, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.24698039215686282, "calib/std_conf": 0.009152978140624662, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1671.0, "completions/max_terminated_length": 1671.0, "completions/mean_length": 636.95703125, "completions/mean_terminated_length": 647.0675048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 222.0, "epoch": 0.1152, "grad_norm": 0.014318176545202732, "learning_rate": 2.5555555555555557e-06, "loss": -0.0231, "num_tokens": 28704020.0, "reward": 1.2126870155334473, "reward_std": 0.3260917067527771, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.73213791847229, "rewards/format_reward_step": 0.9921875, "rewards/stepwise_brier_reward": 0.8467346429824829, "step": 108 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5030179847253018, "calib/avg_num_step_conf": 3.91796875, "calib/ece": 0.4476470588235293, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00017553584626739394, "calib/mean_conf": 0.9652941176470587, "calib/mu_c": 0.9653787878787877, "calib/mu_w": 0.9652032520325203, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4476470588235293, "calib/std_conf": 0.01174115289399968, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1820.0, "completions/max_terminated_length": 1820.0, "completions/mean_length": 644.87890625, "completions/mean_terminated_length": 655.1151123046875, "completions/min_length": 0.0, "completions/min_terminated_length": 284.0, "epoch": 0.11626666666666667, "grad_norm": 0.022243350744247437, "learning_rate": 2.5277777777777778e-06, "loss": -0.033, "num_tokens": 28973709.0, "reward": 0.9482213258743286, "reward_std": 0.19971056282520294, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5477253794670105, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.784222424030304, "step": 109 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.4972587719298246, "calib/avg_num_step_conf": 4.02734375, "calib/ece": 0.25679687500000004, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00029824561403524186, "calib/mean_conf": 0.959921875, "calib/mu_c": 0.9598333333333332, "calib/mu_w": 0.9601315789473684, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25679687500000004, "calib/std_conf": 0.01533460617310974, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1169.0, "completions/max_terminated_length": 1169.0, "completions/mean_length": 592.58984375, "completions/mean_terminated_length": 601.99609375, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.11733333333333333, "grad_norm": 0.014417297206819057, "learning_rate": 2.5e-06, "loss": 0.0171, "num_tokens": 29230332.0, "reward": 1.200930118560791, "reward_std": 0.24814897775650024, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7249554395675659, "rewards/format_reward_step": 1.0, "rewards/stepwise_brier_reward": 0.8662647604942322, "step": 110 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4847940074906367, "calib/avg_num_step_conf": 4.015625, "calib/ece": 0.2515810276679842, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9802371541501976, "calib/gap": 0.0031340823970038567, "calib/mean_conf": 0.9551383399209485, "calib/mu_c": 0.9560674157303372, "calib/mu_w": 0.9529333333333333, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2515810276679842, "calib/std_conf": 0.025793353167044192, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2449.0, "completions/max_terminated_length": 2449.0, "completions/mean_length": 668.1171875, "completions/mean_terminated_length": 678.7222900390625, "completions/min_length": 0.0, "completions/min_terminated_length": 315.0, "epoch": 0.1184, "grad_norm": 0.011323070153594017, "learning_rate": 2.4722222222222226e-06, "loss": -0.0366, "num_tokens": 29508778.0, "reward": 1.1823463439941406, "reward_std": 0.26640817523002625, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.7202441692352295, "rewards/format_reward_step": 0.98828125, "rewards/stepwise_brier_reward": 0.8325787782669067, "step": 111 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6160616061606161, "calib/avg_num_step_conf": 4.0703125, "calib/ece": 0.3443307086614172, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.952755905511811, "calib/gap": 0.012426066136025482, "calib/mean_conf": 0.9466929133858267, "calib/mu_c": 0.9516339869281045, "calib/mu_w": 0.939207920792079, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3443307086614172, "calib/std_conf": 0.03518354836801074, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1526.0, "completions/max_terminated_length": 1526.0, "completions/mean_length": 730.49609375, "completions/mean_terminated_length": 745.0478515625, "completions/min_length": 0.0, "completions/min_terminated_length": 410.0, "epoch": 0.11946666666666667, "grad_norm": 0.00980925653129816, "learning_rate": 2.4444444444444447e-06, "loss": -0.0015, "num_tokens": 29803705.0, "reward": 1.0600612163543701, "reward_std": 0.33109667897224426, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6376851797103882, "rewards/format_reward_step": 0.98828125, "rewards/stepwise_brier_reward": 0.8166217803955078, "step": 112 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6284715350616796, "calib/avg_num_step_conf": 4.39453125, "calib/ece": 0.30746093750000003, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.8828125, "calib/gap": 0.0185546540009236, "calib/mean_conf": 0.9441796875, "calib/mu_c": 0.950920245398773, "calib/mu_w": 0.9323655913978494, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.30746093750000003, "calib/std_conf": 0.03698752610546349, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1286.0, "completions/max_terminated_length": 1286.0, "completions/mean_length": 625.97265625, "completions/mean_terminated_length": 635.9087524414062, "completions/min_length": 0.0, "completions/min_terminated_length": 354.0, "epoch": 0.12053333333333334, "grad_norm": 0.010841844603419304, "learning_rate": 2.4166666666666667e-06, "loss": 0.0013, "num_tokens": 30069154.0, "reward": 1.1145427227020264, "reward_std": 0.27028217911720276, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.677658200263977, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.8352002501487732, "step": 113 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6687409812409812, "calib/avg_num_step_conf": 4.43359375, "calib/ece": 0.15862204724409432, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8622047244094488, "calib/gap": 0.036111111111111094, "calib/mean_conf": 0.9381496062992125, "calib/mu_c": 0.946111111111111, "calib/mu_w": 0.9099999999999999, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15862204724409432, "calib/std_conf": 0.044440100133156746, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1630.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 643.71875, "completions/mean_terminated_length": 656.5418701171875, "completions/min_length": 0.0, "completions/min_terminated_length": 271.0, "epoch": 0.1216, "grad_norm": 0.011671209707856178, "learning_rate": 2.388888888888889e-06, "loss": 0.0125, "num_tokens": 30338970.0, "reward": 1.2861350774765015, "reward_std": 0.16096657514572144, "rewards/accuracy_reward_step": 0.7734375, "rewards/final_brier_reward_step": 0.8070573806762695, "rewards/format_reward_step": 0.9921875, "rewards/stepwise_brier_reward": 0.8468576669692993, "step": 114 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.552871529667937, "calib/avg_num_step_conf": 4.30859375, "calib/ece": 0.268235294117647, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.7686274509803922, "calib/gap": 0.021274496461622272, "calib/mean_conf": 0.9231372549019607, "calib/mu_c": 0.9304790419161676, "calib/mu_w": 0.9092045454545453, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.268235294117647, "calib/std_conf": 0.06197691524944577, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1493.0, "completions/max_terminated_length": 1493.0, "completions/mean_length": 653.08203125, "completions/mean_terminated_length": 663.4484252929688, "completions/min_length": 0.0, "completions/min_terminated_length": 339.0, "epoch": 0.12266666666666666, "grad_norm": 0.011316956020891666, "learning_rate": 2.361111111111111e-06, "loss": 0.0125, "num_tokens": 30611423.0, "reward": 1.1329056024551392, "reward_std": 0.2924712002277374, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7050547003746033, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.8187551498413086, "step": 115 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5695140629351156, "calib/avg_num_step_conf": 3.921875, "calib/ece": 0.25125490196078426, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.7254901960784313, "calib/gap": 0.00948412698412704, "calib/mean_conf": 0.9190980392156862, "calib/mu_c": 0.9222222222222222, "calib/mu_w": 0.9127380952380951, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2498823529411764, "calib/std_conf": 0.05277746656052638, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1254.0, "completions/max_terminated_length": 1254.0, "completions/mean_length": 709.12109375, "completions/mean_terminated_length": 720.3770141601562, "completions/min_length": 0.0, "completions/min_terminated_length": 274.0, "epoch": 0.12373333333333333, "grad_norm": 0.014017444103956223, "learning_rate": 2.3333333333333336e-06, "loss": -0.0064, "num_tokens": 30897478.0, "reward": 1.1582738161087036, "reward_std": 0.3228253126144409, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7159402370452881, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.8468425273895264, "step": 116 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5725265941093279, "calib/avg_num_step_conf": 4.05859375, "calib/ece": 0.37046874999999996, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.6953125, "calib/gap": 0.01609051220562019, "calib/mean_conf": 0.9134374999999999, "calib/mu_c": 0.9207913669064747, "calib/mu_w": 0.9047008547008545, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37046874999999996, "calib/std_conf": 0.05893504766902286, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1463.0, "completions/max_terminated_length": 1463.0, "completions/mean_length": 720.44921875, "completions/mean_terminated_length": 731.8849487304688, "completions/min_length": 0.0, "completions/min_terminated_length": 370.0, "epoch": 0.1248, "grad_norm": 0.015497560612857342, "learning_rate": 2.305555555555556e-06, "loss": 0.0125, "num_tokens": 31188513.0, "reward": 1.0010682344436646, "reward_std": 0.22037434577941895, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6191117763519287, "rewards/format_reward_step": 1.0, "rewards/stepwise_brier_reward": 0.813286304473877, "step": 117 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4776532541182824, "calib/avg_num_step_conf": 4.0859375, "calib/ece": 0.29426877470355717, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.849802371541502, "calib/gap": 0.003385093167701858, "calib/mean_conf": 0.9306324110671936, "calib/mu_c": 0.9318633540372669, "calib/mu_w": 0.9284782608695651, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.29426877470355717, "calib/std_conf": 0.0425612237106967, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2541.0, "completions/max_terminated_length": 2541.0, "completions/mean_length": 711.99609375, "completions/mean_terminated_length": 720.4387817382812, "completions/min_length": 0.0, "completions/min_terminated_length": 327.0, "epoch": 0.12586666666666665, "grad_norm": 0.0164240263402462, "learning_rate": 2.277777777777778e-06, "loss": 0.0095, "num_tokens": 31474792.0, "reward": 1.0945225954055786, "reward_std": 0.18863898515701294, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6737667918205261, "rewards/format_reward_step": 0.98828125, "rewards/stepwise_brier_reward": 0.7933861017227173, "step": 118 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5697378277153557, "calib/avg_num_step_conf": 4.015625, "calib/ece": 0.2302371541501975, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.83399209486166, "calib/gap": 0.01642247191011237, "calib/mean_conf": 0.9271541501976284, "calib/mu_c": 0.9320224719101122, "calib/mu_w": 0.9155999999999999, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2269169960474307, "calib/std_conf": 0.054517321037011145, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2649.0, "completions/max_terminated_length": 2649.0, "completions/mean_length": 761.62109375, "completions/mean_terminated_length": 770.6522216796875, "completions/min_length": 0.0, "completions/min_terminated_length": 283.0, "epoch": 0.12693333333333334, "grad_norm": 0.01568424701690674, "learning_rate": 2.25e-06, "loss": -0.0091, "num_tokens": 31774831.0, "reward": 1.1913585662841797, "reward_std": 0.2313280701637268, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.736583948135376, "rewards/format_reward_step": 0.98828125, "rewards/stepwise_brier_reward": 0.8522878885269165, "step": 119 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6274286714510765, "calib/avg_num_step_conf": 4.1328125, "calib/ece": 0.16611764705882354, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9215686274509803, "calib/gap": 0.030411342552074228, "calib/mean_conf": 0.9386666666666666, "calib/mu_c": 0.9455837563451776, "calib/mu_w": 0.9151724137931033, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16611764705882354, "calib/std_conf": 0.045964464819508014, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2050.0, "completions/max_terminated_length": 2050.0, "completions/mean_length": 712.48046875, "completions/mean_terminated_length": 723.7897338867188, "completions/min_length": 0.0, "completions/min_terminated_length": 387.0, "epoch": 0.128, "grad_norm": 0.01246616430580616, "learning_rate": 2.222222222222222e-06, "loss": -0.0171, "num_tokens": 32063914.0, "reward": 1.2954286336898804, "reward_std": 0.16706159710884094, "rewards/accuracy_reward_step": 0.76953125, "rewards/final_brier_reward_step": 0.8021171689033508, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.9030345678329468, "step": 120 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.4978616662554487, "calib/avg_num_step_conf": 4.20703125, "calib/ece": 0.19609374999999996, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.984375, "calib/gap": 0.0027370671930257995, "calib/mean_conf": 0.95, "calib/mu_c": 0.9506735751295337, "calib/mu_w": 0.9479365079365079, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19609374999999996, "calib/std_conf": 0.019545619713889854, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1554.0, "completions/max_terminated_length": 1554.0, "completions/mean_length": 736.43359375, "completions/mean_terminated_length": 748.123046875, "completions/min_length": 0.0, "completions/min_terminated_length": 335.0, "epoch": 0.12906666666666666, "grad_norm": 0.01279258169233799, "learning_rate": 2.1944444444444445e-06, "loss": -0.0089, "num_tokens": 32357497.0, "reward": 1.2658534049987793, "reward_std": 0.2205522656440735, "rewards/accuracy_reward_step": 0.75390625, "rewards/final_brier_reward_step": 0.7766492366790771, "rewards/format_reward_step": 1.0, "rewards/stepwise_brier_reward": 0.871139645576477, "step": 121 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6422010824995901, "calib/avg_num_step_conf": 4.30859375, "calib/ece": 0.2212449799196787, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9959839357429718, "calib/gap": 0.009705592914548089, "calib/mean_conf": 0.9521686746987952, "calib/mu_c": 0.9547802197802197, "calib/mu_w": 0.9450746268656716, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2212449799196787, "calib/std_conf": 0.01803932737045692, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2937.0, "completions/max_terminated_length": 2937.0, "completions/mean_length": 764.97265625, "completions/mean_terminated_length": 789.649169921875, "completions/min_length": 0.0, "completions/min_terminated_length": 373.0, "epoch": 0.13013333333333332, "grad_norm": 0.013162679970264435, "learning_rate": 2.166666666666667e-06, "loss": -0.0412, "num_tokens": 32660674.0, "reward": 1.2031543254852295, "reward_std": 0.1738264560699463, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.7371456623077393, "rewards/format_reward_step": 0.97265625, "rewards/stepwise_brier_reward": 0.8426592350006104, "step": 122 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5592760180995474, "calib/avg_num_step_conf": 4.359375, "calib/ece": 0.2921653543307086, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9960629921259843, "calib/gap": 0.0040522102332056775, "calib/mean_conf": 0.9575196850393702, "calib/mu_c": 0.9588757396449703, "calib/mu_w": 0.9548235294117646, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2921653543307086, "calib/std_conf": 0.013680465061913965, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2433.0, "completions/max_terminated_length": 2433.0, "completions/mean_length": 875.29296875, "completions/mean_terminated_length": 889.1865844726562, "completions/min_length": 0.0, "completions/min_terminated_length": 484.0, "epoch": 0.1312, "grad_norm": 0.009489997290074825, "learning_rate": 2.138888888888889e-06, "loss": -0.0215, "num_tokens": 32990037.0, "reward": 1.1314826011657715, "reward_std": 0.2205740362405777, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.6881800889968872, "rewards/format_reward_step": 0.9921875, "rewards/stepwise_brier_reward": 0.8002502918243408, "step": 123 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.47256357740228705, "calib/avg_num_step_conf": 4.5546875, "calib/ece": 0.20553784860557778, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.000672469704727674, "calib/mean_conf": 0.9585258964143427, "calib/mu_c": 0.9583597883597883, "calib/mu_w": 0.959032258064516, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20553784860557778, "calib/std_conf": 0.010591017330777244, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1538.0, "completions/max_terminated_length": 1538.0, "completions/mean_length": 792.16015625, "completions/mean_terminated_length": 814.4296875, "completions/min_length": 0.0, "completions/min_terminated_length": 381.0, "epoch": 0.13226666666666667, "grad_norm": 0.009493890218436718, "learning_rate": 2.1111111111111114e-06, "loss": -0.0446, "num_tokens": 33299646.0, "reward": 1.2366223335266113, "reward_std": 0.24891909956932068, "rewards/accuracy_reward_step": 0.73828125, "rewards/final_brier_reward_step": 0.7563284635543823, "rewards/format_reward_step": 0.98046875, "rewards/stepwise_brier_reward": 0.8448482155799866, "step": 124 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5018825301204819, "calib/avg_num_step_conf": 4.75, "calib/ece": 0.303740157480315, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.6429353778657507e-05, "calib/mean_conf": 0.9572834645669291, "calib/mu_c": 0.957289156626506, "calib/mu_w": 0.9572727272727274, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.303740157480315, "calib/std_conf": 0.010504140644238415, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1749.0, "completions/max_terminated_length": 1749.0, "completions/mean_length": 812.40234375, "completions/mean_terminated_length": 825.2976684570312, "completions/min_length": 0.0, "completions/min_terminated_length": 336.0, "epoch": 0.13333333333333333, "grad_norm": 0.012744988314807415, "learning_rate": 2.0833333333333334e-06, "loss": -0.0163, "num_tokens": 33612429.0, "reward": 1.1099066734313965, "reward_std": 0.35964831709861755, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6758925914764404, "rewards/format_reward_step": 0.9921875, "rewards/stepwise_brier_reward": 0.7731088995933533, "step": 125 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4965503246753247, "calib/avg_num_step_conf": 4.7578125, "calib/ece": 0.3451200000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.992, "calib/gap": -0.001776244588744813, "calib/mean_conf": 0.95776, "calib/mu_c": 0.957077922077922, "calib/mu_w": 0.9588541666666668, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3434400000000001, "calib/std_conf": 0.0181709218258183, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2527.0, "completions/max_terminated_length": 2527.0, "completions/mean_length": 820.06640625, "completions/mean_terminated_length": 839.748046875, "completions/min_length": 0.0, "completions/min_terminated_length": 340.0, "epoch": 0.1344, "grad_norm": 0.009346295148134232, "learning_rate": 2.0555555555555555e-06, "loss": -0.0071, "num_tokens": 33927830.0, "reward": 1.0485689640045166, "reward_std": 0.32061487436294556, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6266070604324341, "rewards/format_reward_step": 0.97265625, "rewards/stepwise_brier_reward": 0.772356390953064, "step": 126 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5184028798119306, "calib/avg_num_step_conf": 5.375, "calib/ece": 0.29692307692307696, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0007970908022334466, "calib/mean_conf": 0.9608906882591094, "calib/mu_c": 0.9611585365853659, "calib/mu_w": 0.9603614457831324, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.29692307692307696, "calib/std_conf": 0.009608542206389454, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2552.0, "completions/max_terminated_length": 2552.0, "completions/mean_length": 802.7890625, "completions/mean_terminated_length": 832.04052734375, "completions/min_length": 0.0, "completions/min_terminated_length": 404.0, "epoch": 0.13546666666666668, "grad_norm": 0.011093104258179665, "learning_rate": 2.027777777777778e-06, "loss": -0.0381, "num_tokens": 34237016.0, "reward": 1.096947193145752, "reward_std": 0.23692533373832703, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6647632718086243, "rewards/format_reward_step": 0.96484375, "rewards/stepwise_brier_reward": 0.7745875120162964, "step": 127 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.48304778428395867, "calib/avg_num_step_conf": 5.94140625, "calib/ece": 0.315, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9959349593495935, "calib/gap": -0.0013294296248100856, "calib/mean_conf": 0.9597154471544714, "calib/mu_c": 0.9592452830188678, "calib/mu_w": 0.9605747126436779, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3141869918699187, "calib/std_conf": 0.013386086895020086, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2534.0, "completions/max_terminated_length": 2534.0, "completions/mean_length": 936.2421875, "completions/mean_terminated_length": 962.5621948242188, "completions/min_length": 0.0, "completions/min_terminated_length": 465.0, "epoch": 0.13653333333333334, "grad_norm": 0.014946205541491508, "learning_rate": 2.0000000000000003e-06, "loss": -0.0493, "num_tokens": 34583358.0, "reward": 1.0722033977508545, "reward_std": 0.3123183846473694, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.6461589932441711, "rewards/format_reward_step": 0.9609375, "rewards/stepwise_brier_reward": 0.7739043235778809, "step": 128 }, { "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.515956043956044, "calib/avg_num_step_conf": 9.42578125, "calib/ece": 0.23245833333333343, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0011736263736266128, "calib/mean_conf": 0.9616250000000001, "calib/mu_c": 0.9619428571428572, "calib/mu_w": 0.9607692307692306, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23245833333333343, "calib/std_conf": 0.009760944711792336, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2292.0, "completions/max_terminated_length": 2292.0, "completions/mean_length": 825.13671875, "completions/mean_terminated_length": 880.1458740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 384.0, "epoch": 0.1376, "grad_norm": 0.008795306086540222, "learning_rate": 1.9722222222222224e-06, "loss": -0.1218, "num_tokens": 34896977.0, "reward": 1.1481738090515137, "reward_std": 0.24225656688213348, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7017394304275513, "rewards/format_reward_step": 0.93359375, "rewards/stepwise_brier_reward": 0.7831431031227112, "step": 129 }, { "calib/answer_extract_rate": 0.890625, "calib/auroc": 0.5411819887429644, "calib/avg_num_step_conf": 12.00390625, "calib/ece": 0.24126637554585154, "calib/final_conf_rate": 0.89453125, "calib/format_rate": 0.890625, "calib/frac_conf_gt_0.9": 0.9868995633187773, "calib/gap": 0.02239587242026264, "calib/mean_conf": 0.9574235807860262, "calib/mu_c": 0.963780487804878, "calib/mu_w": 0.9413846153846154, "calib/nonempty_final_conf_rate": 0.89453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24126637554585154, "calib/std_conf": 0.06479999166611877, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2554.0, "completions/max_terminated_length": 2554.0, "completions/mean_length": 825.1328125, "completions/mean_terminated_length": 910.4913940429688, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.13866666666666666, "grad_norm": 0.009501713328063488, "learning_rate": 1.944444444444445e-06, "loss": -0.1706, "num_tokens": 35213499.0, "reward": 1.083107352256775, "reward_std": 0.39128535985946655, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6647065877914429, "rewards/format_reward_step": 0.890625, "rewards/stepwise_brier_reward": 0.7489725947380066, "step": 130 }, { "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.5774928774928775, "calib/avg_num_step_conf": 9.33203125, "calib/ece": 0.469915611814346, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.003044871794871673, "calib/mean_conf": 0.9635864978902954, "calib/mu_c": 0.9651282051282052, "calib/mu_w": 0.9620833333333335, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.469915611814346, "calib/std_conf": 0.009904922867843452, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2537.0, "completions/max_terminated_length": 2537.0, "completions/mean_length": 921.2734375, "completions/mean_terminated_length": 990.9496459960938, "completions/min_length": 0.0, "completions/min_terminated_length": 492.0, "epoch": 0.13973333333333332, "grad_norm": 0.011380190961062908, "learning_rate": 1.916666666666667e-06, "loss": -0.1174, "num_tokens": 35555553.0, "reward": 0.8316460251808167, "reward_std": 0.3480015993118286, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.49125975370407104, "rewards/format_reward_step": 0.92578125, "rewards/stepwise_brier_reward": 0.636886715888977, "step": 131 }, { "calib/answer_extract_rate": 0.8359375, "calib/auroc": 0.5970933828076684, "calib/avg_num_step_conf": 16.4765625, "calib/ece": 0.19271028037383175, "calib/final_conf_rate": 0.8359375, "calib/format_rate": 0.8359375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00352504638218909, "calib/mean_conf": 0.9637383177570094, "calib/mu_c": 0.9645454545454544, "calib/mu_w": 0.9610204081632653, "calib/nonempty_final_conf_rate": 0.8359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19271028037383175, "calib/std_conf": 0.00981352913988665, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2550.0, "completions/max_terminated_length": 2550.0, "completions/mean_length": 846.484375, "completions/mean_terminated_length": 1003.24072265625, "completions/min_length": 0.0, "completions/min_terminated_length": 296.0, "epoch": 0.1408, "grad_norm": 0.012668384239077568, "learning_rate": 1.888888888888889e-06, "loss": -0.1669, "num_tokens": 35877845.0, "reward": 1.0679371356964111, "reward_std": 0.5534745454788208, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.658273458480835, "rewards/format_reward_step": 0.8359375, "rewards/stepwise_brier_reward": 0.7009752988815308, "step": 132 }, { "calib/answer_extract_rate": 0.89453125, "calib/auroc": 0.5456532297231665, "calib/avg_num_step_conf": 9.921875, "calib/ece": 0.3414410480349344, "calib/final_conf_rate": 0.89453125, "calib/format_rate": 0.890625, "calib/frac_conf_gt_0.9": 0.9956331877729258, "calib/gap": 0.007098915331066991, "calib/mean_conf": 0.961528384279476, "calib/mu_c": 0.964225352112676, "calib/mu_w": 0.957126436781609, "calib/nonempty_final_conf_rate": 0.89453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3414410480349344, "calib/std_conf": 0.03218159890169841, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 2793.0, "completions/max_terminated_length": 2793.0, "completions/mean_length": 1005.80078125, "completions/mean_terminated_length": 1114.6536865234375, "completions/min_length": 0.0, "completions/min_terminated_length": 310.0, "epoch": 0.14186666666666667, "grad_norm": 0.008075143210589886, "learning_rate": 1.8611111111111113e-06, "loss": -0.1096, "num_tokens": 36241674.0, "reward": 0.9602289199829102, "reward_std": 0.39630305767059326, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.577673077583313, "rewards/format_reward_step": 0.890625, "rewards/stepwise_brier_reward": 0.6882424354553223, "step": 133 }, { "calib/answer_extract_rate": 0.78515625, "calib/auroc": 0.5155305399207838, "calib/avg_num_step_conf": 16.03515625, "calib/ece": 0.3506965174129354, "calib/final_conf_rate": 0.78515625, "calib/format_rate": 0.78515625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0003283302063792348, "calib/mean_conf": 0.9626368159203981, "calib/mu_c": 0.9627642276422764, "calib/mu_w": 0.9624358974358972, "calib/nonempty_final_conf_rate": 0.78515625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3506965174129354, "calib/std_conf": 0.010950473662984937, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19140625, "completions/max_length": 2646.0, "completions/max_terminated_length": 2646.0, "completions/mean_length": 921.421875, "completions/mean_terminated_length": 1139.5362548828125, "completions/min_length": 0.0, "completions/min_terminated_length": 381.0, "epoch": 0.14293333333333333, "grad_norm": 0.013021248392760754, "learning_rate": 1.8333333333333333e-06, "loss": -0.1983, "num_tokens": 36586510.0, "reward": 0.8356744050979614, "reward_std": 0.4603855013847351, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.502169132232666, "rewards/format_reward_step": 0.78515625, "rewards/stepwise_brier_reward": 0.6045907735824585, "step": 134 }, { "calib/answer_extract_rate": 0.59765625, "calib/auroc": 0.4562861946572016, "calib/avg_num_step_conf": 25.8671875, "calib/ece": 0.30525974025974034, "calib/final_conf_rate": 0.6015625, "calib/format_rate": 0.59765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0011937231458993836, "calib/mean_conf": 0.9611038961038962, "calib/mu_c": 0.9606930693069309, "calib/mu_w": 0.9618867924528303, "calib/nonempty_final_conf_rate": 0.6015625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30525974025974034, "calib/std_conf": 0.008103365367236005, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 2617.0, "completions/max_terminated_length": 2617.0, "completions/mean_length": 849.7734375, "completions/mean_terminated_length": 1264.779052734375, "completions/min_length": 0.0, "completions/min_terminated_length": 655.0, "epoch": 0.144, "grad_norm": 0.011686751618981361, "learning_rate": 1.8055555555555557e-06, "loss": -0.3331, "num_tokens": 36909932.0, "reward": 0.6714417338371277, "reward_std": 0.6035765409469604, "rewards/accuracy_reward_step": 0.39453125, "rewards/final_brier_reward_step": 0.40920817852020264, "rewards/format_reward_step": 0.59765625, "rewards/stepwise_brier_reward": 0.4593711495399475, "step": 135 }, { "calib/answer_extract_rate": 0.640625, "calib/auroc": 0.5343899061394061, "calib/avg_num_step_conf": 20.546875, "calib/ece": 0.36969512195121956, "calib/final_conf_rate": 0.640625, "calib/format_rate": 0.640625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0009493768272041958, "calib/mean_conf": 0.9611585365853659, "calib/mu_c": 0.9615463917525773, "calib/mu_w": 0.9605970149253731, "calib/nonempty_final_conf_rate": 0.640625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36969512195121956, "calib/std_conf": 0.008862045010089754, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.30078125, "completions/max_length": 3037.0, "completions/max_terminated_length": 3037.0, "completions/mean_length": 835.234375, "completions/mean_terminated_length": 1194.525146484375, "completions/min_length": 0.0, "completions/min_terminated_length": 404.0, "epoch": 0.14506666666666668, "grad_norm": 0.015090246684849262, "learning_rate": 1.777777777777778e-06, "loss": -0.323, "num_tokens": 37232240.0, "reward": 0.6637687087059021, "reward_std": 0.5630416870117188, "rewards/accuracy_reward_step": 0.37890625, "rewards/final_brier_reward_step": 0.39851444959640503, "rewards/format_reward_step": 0.640625, "rewards/stepwise_brier_reward": 0.4846852719783783, "step": 136 }, { "calib/answer_extract_rate": 0.73046875, "calib/auroc": 0.49889354763568594, "calib/avg_num_step_conf": 17.50390625, "calib/ece": 0.39572192513369003, "calib/final_conf_rate": 0.73046875, "calib/format_rate": 0.73046875, "calib/frac_conf_gt_0.9": 0.9946524064171123, "calib/gap": -0.0028744467738177626, "calib/mean_conf": 0.9593582887700535, "calib/mu_c": 0.9581132075471699, "calib/mu_w": 0.9609876543209876, "calib/nonempty_final_conf_rate": 0.73046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39411764705882374, "calib/std_conf": 0.020595803873086484, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.24609375, "completions/max_length": 2697.0, "completions/max_terminated_length": 2697.0, "completions/mean_length": 870.0390625, "completions/mean_terminated_length": 1154.0413818359375, "completions/min_length": 0.0, "completions/min_terminated_length": 600.0, "epoch": 0.14613333333333334, "grad_norm": 0.010640913620591164, "learning_rate": 1.75e-06, "loss": -0.2487, "num_tokens": 37561954.0, "reward": 0.7320098876953125, "reward_std": 0.43524622917175293, "rewards/accuracy_reward_step": 0.4140625, "rewards/final_brier_reward_step": 0.4372335970401764, "rewards/format_reward_step": 0.73046875, "rewards/stepwise_brier_reward": 0.5423684120178223, "step": 137 }, { "calib/answer_extract_rate": 0.75390625, "calib/auroc": 0.45502266762109284, "calib/avg_num_step_conf": 20.7265625, "calib/ece": 0.30124352331606236, "calib/final_conf_rate": 0.75390625, "calib/format_rate": 0.75, "calib/frac_conf_gt_0.9": 0.9948186528497409, "calib/gap": 4.891434025289243e-05, "calib/mean_conf": 0.9592746113989639, "calib/mu_c": 0.959291338582677, "calib/mu_w": 0.9592424242424241, "calib/nonempty_final_conf_rate": 0.75390625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.30124352331606236, "calib/std_conf": 0.01480415247653411, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 2777.0, "completions/max_terminated_length": 2777.0, "completions/mean_length": 821.73046875, "completions/mean_terminated_length": 1073.2806396484375, "completions/min_length": 0.0, "completions/min_terminated_length": 423.0, "epoch": 0.1472, "grad_norm": 0.012203462421894073, "learning_rate": 1.7222222222222224e-06, "loss": -0.241, "num_tokens": 37876653.0, "reward": 0.8494399189949036, "reward_std": 0.46320798993110657, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.5117890238761902, "rewards/format_reward_step": 0.75, "rewards/stepwise_brier_reward": 0.6015955209732056, "step": 138 }, { "calib/answer_extract_rate": 0.79296875, "calib/auroc": 0.4723415682062298, "calib/avg_num_step_conf": 19.46875, "calib/ece": 0.3055172413793106, "calib/final_conf_rate": 0.79296875, "calib/format_rate": 0.79296875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0006917293233084099, "calib/mean_conf": 0.960689655172414, "calib/mu_c": 0.9604511278195488, "calib/mu_w": 0.9611428571428572, "calib/nonempty_final_conf_rate": 0.79296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3055172413793106, "calib/std_conf": 0.008739088418339586, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.18359375, "completions/max_length": 2835.0, "completions/max_terminated_length": 2835.0, "completions/mean_length": 763.87109375, "completions/mean_terminated_length": 935.6506958007812, "completions/min_length": 0.0, "completions/min_terminated_length": 332.0, "epoch": 0.14826666666666666, "grad_norm": 0.009132412262260914, "learning_rate": 1.6944444444444446e-06, "loss": -0.2696, "num_tokens": 38175300.0, "reward": 0.8950383067131042, "reward_std": 0.49091672897338867, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.5394952893257141, "rewards/format_reward_step": 0.79296875, "rewards/stepwise_brier_reward": 0.6453452706336975, "step": 139 }, { "calib/answer_extract_rate": 0.85546875, "calib/auroc": 0.5512957806810881, "calib/avg_num_step_conf": 14.87109375, "calib/ece": 0.22383561643835628, "calib/final_conf_rate": 0.85546875, "calib/format_rate": 0.85546875, "calib/frac_conf_gt_0.9": 0.9954337899543378, "calib/gap": 0.005903833797386793, "calib/mean_conf": 0.9589954337899546, "calib/mu_c": 0.96055900621118, "calib/mu_w": 0.9546551724137932, "calib/nonempty_final_conf_rate": 0.85546875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22383561643835628, "calib/std_conf": 0.02084723686932632, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 2551.0, "completions/max_terminated_length": 2551.0, "completions/mean_length": 865.25, "completions/mean_terminated_length": 997.7658081054688, "completions/min_length": 0.0, "completions/min_terminated_length": 265.0, "epoch": 0.14933333333333335, "grad_norm": 0.011655515059828758, "learning_rate": 1.6666666666666667e-06, "loss": -0.2195, "num_tokens": 38501820.0, "reward": 1.05995774269104, "reward_std": 0.4912075996398926, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.647642970085144, "rewards/format_reward_step": 0.85546875, "rewards/stepwise_brier_reward": 0.7343755960464478, "step": 140 }, { "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.5547623119015048, "calib/avg_num_step_conf": 10.14453125, "calib/ece": 0.24087499999999995, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.9916666666666667, "calib/gap": 0.0021101231190151504, "calib/mean_conf": 0.9575416666666666, "calib/mu_c": 0.958139534883721, "calib/mu_w": 0.9560294117647058, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24087499999999995, "calib/std_conf": 0.012153460298294564, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2538.0, "completions/max_terminated_length": 2538.0, "completions/mean_length": 924.60546875, "completions/mean_terminated_length": 982.153564453125, "completions/min_length": 0.0, "completions/min_terminated_length": 574.0, "epoch": 0.1504, "grad_norm": 0.007671962957829237, "learning_rate": 1.638888888888889e-06, "loss": -0.1046, "num_tokens": 38845615.0, "reward": 1.1376688480377197, "reward_std": 0.3450199365615845, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.69340580701828, "rewards/format_reward_step": 0.9375, "rewards/stepwise_brier_reward": 0.7947694659233093, "step": 141 }, { "calib/answer_extract_rate": 0.8671875, "calib/auroc": 0.5717770034843206, "calib/avg_num_step_conf": 13.859375, "calib/ece": 0.32923423423423426, "calib/final_conf_rate": 0.8671875, "calib/format_rate": 0.8671875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.002106271777003621, "calib/mean_conf": 0.9598648648648649, "calib/mu_c": 0.9606428571428572, "calib/mu_w": 0.9585365853658536, "calib/nonempty_final_conf_rate": 0.8671875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32923423423423426, "calib/std_conf": 0.006186277296043263, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.12109375, "completions/max_length": 2269.0, "completions/max_terminated_length": 2269.0, "completions/mean_length": 874.20703125, "completions/mean_terminated_length": 994.6533813476562, "completions/min_length": 0.0, "completions/min_terminated_length": 441.0, "epoch": 0.15146666666666667, "grad_norm": 0.009174701757729053, "learning_rate": 1.6111111111111113e-06, "loss": -0.1348, "num_tokens": 39174572.0, "reward": 0.9420343041419983, "reward_std": 0.32860827445983887, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.572007417678833, "rewards/format_reward_step": 0.8671875, "rewards/stepwise_brier_reward": 0.6617546081542969, "step": 142 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5052585451358458, "calib/avg_num_step_conf": 8.84375, "calib/ece": 0.2995951417004051, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9959514170040485, "calib/gap": 0.0016089687408705755, "calib/mean_conf": 0.9595141700404861, "calib/mu_c": 0.9600613496932516, "calib/mu_w": 0.9584523809523811, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2995951417004051, "calib/std_conf": 0.01147814335676527, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2772.0, "completions/max_terminated_length": 2772.0, "completions/mean_length": 915.82421875, "completions/mean_terminated_length": 945.3668823242188, "completions/min_length": 0.0, "completions/min_terminated_length": 413.0, "epoch": 0.15253333333333333, "grad_norm": 0.014339365065097809, "learning_rate": 1.5833333333333333e-06, "loss": -0.0549, "num_tokens": 39516359.0, "reward": 1.0884448289871216, "reward_std": 0.35324692726135254, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6622757315635681, "rewards/format_reward_step": 0.96484375, "rewards/stepwise_brier_reward": 0.7586911916732788, "step": 143 }, { "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.48947368421052634, "calib/avg_num_step_conf": 8.71484375, "calib/ece": 0.19495867768595054, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.9958677685950413, "calib/gap": 0.002226647700332096, "calib/mean_conf": 0.9594214876033059, "calib/mu_c": 0.9599459459459461, "calib/mu_w": 0.957719298245614, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19495867768595054, "calib/std_conf": 0.012012219719120395, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1644.0, "completions/max_terminated_length": 1644.0, "completions/mean_length": 842.0390625, "completions/mean_terminated_length": 879.8448486328125, "completions/min_length": 0.0, "completions/min_terminated_length": 405.0, "epoch": 0.1536, "grad_norm": 0.010931096971035004, "learning_rate": 1.5555555555555558e-06, "loss": -0.0531, "num_tokens": 39836049.0, "reward": 1.2023956775665283, "reward_std": 0.37323635816574097, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7397913932800293, "rewards/format_reward_step": 0.9453125, "rewards/stepwise_brier_reward": 0.8010416030883789, "step": 144 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5710518028656905, "calib/avg_num_step_conf": 7.19921875, "calib/ece": 0.25028340080971695, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9757085020242915, "calib/gap": 0.022280743190049024, "calib/mean_conf": 0.9547368421052633, "calib/mu_c": 0.96132183908046, "calib/mu_w": 0.939041095890411, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25028340080971695, "calib/std_conf": 0.04211791307951345, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2500.0, "completions/max_terminated_length": 2500.0, "completions/mean_length": 871.7265625, "completions/mean_terminated_length": 903.4899291992188, "completions/min_length": 0.0, "completions/min_terminated_length": 399.0, "epoch": 0.15466666666666667, "grad_norm": 0.01119040697813034, "learning_rate": 1.527777777777778e-06, "loss": -0.0553, "num_tokens": 40161915.0, "reward": 1.152696132659912, "reward_std": 0.3916665315628052, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.7107648253440857, "rewards/format_reward_step": 0.96484375, "rewards/stepwise_brier_reward": 0.7953319549560547, "step": 145 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.49903944672131145, "calib/avg_num_step_conf": 6.11328125, "calib/ece": 0.4719600000000002, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.992, "calib/gap": -0.0005622438524590656, "calib/mean_conf": 0.95996, "calib/mu_c": 0.9596721311475411, "calib/mu_w": 0.9602343750000002, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4719600000000002, "calib/std_conf": 0.01285295296809259, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2899.0, "completions/max_terminated_length": 2899.0, "completions/mean_length": 875.92578125, "completions/mean_terminated_length": 896.9480590820312, "completions/min_length": 0.0, "completions/min_terminated_length": 400.0, "epoch": 0.15573333333333333, "grad_norm": 0.01072732638567686, "learning_rate": 1.5e-06, "loss": -0.001, "num_tokens": 40493368.0, "reward": 0.8771609663963318, "reward_std": 0.2998144030570984, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.5146011710166931, "rewards/format_reward_step": 0.9765625, "rewards/stepwise_brier_reward": 0.6971675753593445, "step": 146 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4942438513867086, "calib/avg_num_step_conf": 5.515625, "calib/ece": 0.34433070866141724, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9921259842519685, "calib/gap": -0.0002760334903192385, "calib/mean_conf": 0.9585039370078741, "calib/mu_c": 0.9583974358974359, "calib/mu_w": 0.9586734693877551, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34433070866141724, "calib/std_conf": 0.011877242194119151, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2530.0, "completions/max_terminated_length": 2530.0, "completions/mean_length": 831.65234375, "completions/mean_terminated_length": 841.5138549804688, "completions/min_length": 0.0, "completions/min_terminated_length": 392.0, "epoch": 0.1568, "grad_norm": 0.011501497589051723, "learning_rate": 1.4722222222222225e-06, "loss": 0.0032, "num_tokens": 40809951.0, "reward": 1.0649793148040771, "reward_std": 0.3345613181591034, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6391671895980835, "rewards/format_reward_step": 0.9921875, "rewards/stepwise_brier_reward": 0.7863753437995911, "step": 147 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4961053737090846, "calib/avg_num_step_conf": 4.76953125, "calib/ece": 0.18725490196078431, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0007001575354455003, "calib/mean_conf": 0.9598039215686275, "calib/mu_c": 0.9596446700507615, "calib/mu_w": 0.960344827586207, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18725490196078431, "calib/std_conf": 0.008234360357893773, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1352.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 735.5546875, "completions/mean_terminated_length": 747.230224609375, "completions/min_length": 0.0, "completions/min_terminated_length": 331.0, "epoch": 0.15786666666666666, "grad_norm": 0.012875870801508427, "learning_rate": 1.4444444444444445e-06, "loss": -0.0103, "num_tokens": 41103365.0, "reward": 1.2776968479156494, "reward_std": 0.20390915870666504, "rewards/accuracy_reward_step": 0.76953125, "rewards/final_brier_reward_step": 0.7858230471611023, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.8484017252922058, "step": 148 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5301878354203936, "calib/avg_num_step_conf": 3.9296875, "calib/ece": 0.27052000000000015, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.98, "calib/gap": 0.006420691711389148, "calib/mean_conf": 0.9585199999999999, "calib/mu_c": 0.9605232558139535, "calib/mu_w": 0.9541025641025643, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27052000000000015, "calib/std_conf": 0.017315010828757804, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2647.0, "completions/max_terminated_length": 2647.0, "completions/mean_length": 765.10546875, "completions/mean_terminated_length": 789.7862548828125, "completions/min_length": 0.0, "completions/min_terminated_length": 429.0, "epoch": 0.15893333333333334, "grad_norm": 0.01118408888578415, "learning_rate": 1.4166666666666667e-06, "loss": -0.0297, "num_tokens": 41403688.0, "reward": 1.1512722969055176, "reward_std": 0.2149408757686615, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.6977159976959229, "rewards/format_reward_step": 0.97265625, "rewards/stepwise_brier_reward": 0.8308108448982239, "step": 149 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.47813607370659106, "calib/avg_num_step_conf": 3.7109375, "calib/ece": 0.28581027667984205, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9920948616600791, "calib/gap": 0.0020262225372077447, "calib/mean_conf": 0.9577470355731227, "calib/mu_c": 0.9584117647058824, "calib/mu_w": 0.9563855421686747, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28581027667984205, "calib/std_conf": 0.01248173531639795, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2627.0, "completions/max_terminated_length": 2627.0, "completions/mean_length": 726.52734375, "completions/mean_terminated_length": 735.142333984375, "completions/min_length": 0.0, "completions/min_terminated_length": 329.0, "epoch": 0.16, "grad_norm": 0.014552582055330276, "learning_rate": 1.3888888888888892e-06, "loss": -0.0061, "num_tokens": 41694639.0, "reward": 1.1458451747894287, "reward_std": 0.24657873809337616, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.6904253959655762, "rewards/format_reward_step": 0.98828125, "rewards/stepwise_brier_reward": 0.8413926362991333, "step": 150 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4726430976430977, "calib/avg_num_step_conf": 3.28515625, "calib/ece": 0.348470588235294, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9764705882352941, "calib/gap": -0.0028885003885003036, "calib/mean_conf": 0.956313725490196, "calib/mu_c": 0.9551923076923077, "calib/mu_w": 0.958080808080808, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3465098039215685, "calib/std_conf": 0.025043875378300776, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2184.0, "completions/max_terminated_length": 2184.0, "completions/mean_length": 772.09375, "completions/mean_terminated_length": 784.3492431640625, "completions/min_length": 0.0, "completions/min_terminated_length": 361.0, "epoch": 0.16106666666666666, "grad_norm": 0.013441565446555614, "learning_rate": 1.3611111111111112e-06, "loss": -0.0115, "num_tokens": 41999319.0, "reward": 1.0731348991394043, "reward_std": 0.20622123777866364, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6389647722244263, "rewards/format_reward_step": 0.9921875, "rewards/stepwise_brier_reward": 0.8191995620727539, "step": 151 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5231050531914896, "calib/avg_num_step_conf": 3.30078125, "calib/ece": 0.32673228346456684, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.968503937007874, "calib/gap": 0.004820478723404298, "calib/mean_conf": 0.9566535433070866, "calib/mu_c": 0.9584374999999999, "calib/mu_w": 0.9536170212765956, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32673228346456684, "calib/std_conf": 0.024961033554450264, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2135.0, "completions/max_terminated_length": 2135.0, "completions/mean_length": 717.578125, "completions/mean_terminated_length": 728.9683227539062, "completions/min_length": 0.0, "completions/min_terminated_length": 377.0, "epoch": 0.16213333333333332, "grad_norm": 0.009903301484882832, "learning_rate": 1.3333333333333334e-06, "loss": 0.0004, "num_tokens": 42288411.0, "reward": 1.0950580835342407, "reward_std": 0.3089412748813629, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6565800905227661, "rewards/format_reward_step": 0.9921875, "rewards/stepwise_brier_reward": 0.8267771601676941, "step": 152 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4682703659976387, "calib/avg_num_step_conf": 3.0703125, "calib/ece": 0.2642687747035575, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9762845849802372, "calib/gap": -0.003482142857142545, "calib/mean_conf": 0.957707509881423, "calib/mu_c": 0.9566477272727273, "calib/mu_w": 0.9601298701298698, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.26316205533596854, "calib/std_conf": 0.026440353914862467, "calib/step_conf_rate": 0.98828125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1537.0, "completions/max_terminated_length": 1537.0, "completions/mean_length": 739.21875, "completions/mean_terminated_length": 753.9442749023438, "completions/min_length": 0.0, "completions/min_terminated_length": 411.0, "epoch": 0.1632, "grad_norm": 0.008799062110483646, "learning_rate": 1.3055555555555556e-06, "loss": 0.0294, "num_tokens": 42584971.0, "reward": 1.1705644130706787, "reward_std": 0.24496926367282867, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.7009167671203613, "rewards/format_reward_step": 0.9765625, "rewards/stepwise_brier_reward": 0.8407156467437744, "step": 153 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.45919984155278265, "calib/avg_num_step_conf": 2.96484375, "calib/ece": 0.34948412698412706, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9682539682539683, "calib/gap": 0.00392751039809891, "calib/mean_conf": 0.9566269841269842, "calib/mu_c": 0.958169934640523, "calib/mu_w": 0.9542424242424241, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34948412698412706, "calib/std_conf": 0.023336133888730664, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2352.0, "completions/max_terminated_length": 2352.0, "completions/mean_length": 714.046875, "completions/mean_terminated_length": 728.2709350585938, "completions/min_length": 0.0, "completions/min_terminated_length": 325.0, "epoch": 0.16426666666666667, "grad_norm": 0.0115498723462224, "learning_rate": 1.2777777777777779e-06, "loss": 0.008, "num_tokens": 42872207.0, "reward": 1.0598273277282715, "reward_std": 0.22214645147323608, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6306589841842651, "rewards/format_reward_step": 0.984375, "rewards/stepwise_brier_reward": 0.8242752552032471, "step": 154 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5518433910781142, "calib/avg_num_step_conf": 3.05859375, "calib/ece": 0.3558984375000002, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9453125, "calib/gap": 0.00935909638936483, "calib/mean_conf": 0.9535546875, "calib/mu_c": 0.9573202614379083, "calib/mu_w": 0.9479611650485434, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3558984375000002, "calib/std_conf": 0.03414498238068579, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1333.0, "completions/max_terminated_length": 1333.0, "completions/mean_length": 684.67578125, "completions/mean_terminated_length": 695.543701171875, "completions/min_length": 0.0, "completions/min_terminated_length": 315.0, "epoch": 0.16533333333333333, "grad_norm": 0.010745114646852016, "learning_rate": 1.25e-06, "loss": 0.0037, "num_tokens": 43154700.0, "reward": 1.0507065057754517, "reward_std": 0.41136685013771057, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6323082447052002, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.7814552187919617, "step": 155 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5334151055605115, "calib/avg_num_step_conf": 2.88671875, "calib/ece": 0.25478260869565234, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9446640316205533, "calib/gap": 0.009844632768361672, "calib/mean_conf": 0.9543873517786562, "calib/mu_c": 0.9573446327683616, "calib/mu_w": 0.9474999999999999, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.25478260869565234, "calib/std_conf": 0.03352953034624684, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2221.0, "completions/max_terminated_length": 2221.0, "completions/mean_length": 725.1875, "completions/mean_terminated_length": 736.6984252929688, "completions/min_length": 0.0, "completions/min_terminated_length": 336.0, "epoch": 0.1664, "grad_norm": 0.01419080700725317, "learning_rate": 1.2222222222222223e-06, "loss": -0.0203, "num_tokens": 43445108.0, "reward": 1.1801788806915283, "reward_std": 0.29406630992889404, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.7191046476364136, "rewards/format_reward_step": 0.984375, "rewards/stepwise_brier_reward": 0.8422356247901917, "step": 156 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5320462409562755, "calib/avg_num_step_conf": 2.7421875, "calib/ece": 0.223764705882353, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9607843137254902, "calib/gap": 0.0062700534759360105, "calib/mean_conf": 0.9570980392156863, "calib/mu_c": 0.9587700534759359, "calib/mu_w": 0.9524999999999999, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.223764705882353, "calib/std_conf": 0.02516241743269683, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2504.0, "completions/max_terminated_length": 2504.0, "completions/mean_length": 681.20703125, "completions/mean_terminated_length": 689.2846069335938, "completions/min_length": 0.0, "completions/min_terminated_length": 268.0, "epoch": 0.16746666666666668, "grad_norm": 0.012100272811949253, "learning_rate": 1.1944444444444446e-06, "loss": 0.0091, "num_tokens": 43723225.0, "reward": 1.234571099281311, "reward_std": 0.3096979856491089, "rewards/accuracy_reward_step": 0.73046875, "rewards/final_brier_reward_step": 0.7527780532836914, "rewards/format_reward_step": 0.98828125, "rewards/stepwise_brier_reward": 0.8683183193206787, "step": 157 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4596043491392328, "calib/avg_num_step_conf": 2.88671875, "calib/ece": 0.27389558232931754, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.963855421686747, "calib/gap": 0.0038364542434311533, "calib/mean_conf": 0.9565461847389559, "calib/mu_c": 0.957732558139535, "calib/mu_w": 0.9538961038961038, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.26983935742971915, "calib/std_conf": 0.03797217956855723, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2513.0, "completions/max_terminated_length": 2513.0, "completions/mean_length": 657.5625, "completions/mean_terminated_length": 676.0481567382812, "completions/min_length": 0.0, "completions/min_terminated_length": 236.0, "epoch": 0.16853333333333334, "grad_norm": 0.010984676890075207, "learning_rate": 1.1666666666666668e-06, "loss": 0.0078, "num_tokens": 43996801.0, "reward": 1.1524807214736938, "reward_std": 0.26436007022857666, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.6960608959197998, "rewards/format_reward_step": 0.96484375, "rewards/stepwise_brier_reward": 0.8247992992401123, "step": 158 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5109325702546043, "calib/avg_num_step_conf": 2.83203125, "calib/ece": 0.2590944881889764, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9251968503937008, "calib/gap": 0.0066380512143232595, "calib/mean_conf": 0.9539763779527559, "calib/mu_c": 0.955988700564972, "calib/mu_w": 0.9493506493506487, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.25811023622047247, "calib/std_conf": 0.03226879347837574, "calib/step_conf_rate": 0.98828125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1392.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 651.796875, "completions/mean_terminated_length": 662.1428833007812, "completions/min_length": 0.0, "completions/min_terminated_length": 320.0, "epoch": 0.1696, "grad_norm": 0.030613403767347336, "learning_rate": 1.138888888888889e-06, "loss": -0.003, "num_tokens": 44268445.0, "reward": 1.1845319271087646, "reward_std": 0.3275502622127533, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.7070433497428894, "rewards/format_reward_step": 0.98046875, "rewards/stepwise_brier_reward": 0.8732717037200928, "step": 159 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5020308123249301, "calib/avg_num_step_conf": 2.734375, "calib/ece": 0.2885039370078742, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9488188976377953, "calib/gap": 0.0032577030812325214, "calib/mean_conf": 0.9552755905511812, "calib/mu_c": 0.9563529411764705, "calib/mu_w": 0.953095238095238, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.28724409448818916, "calib/std_conf": 0.03526785629498021, "calib/step_conf_rate": 0.984375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2264.0, "completions/max_terminated_length": 2264.0, "completions/mean_length": 643.8359375, "completions/mean_terminated_length": 654.0556030273438, "completions/min_length": 0.0, "completions/min_terminated_length": 256.0, "epoch": 0.17066666666666666, "grad_norm": 0.010678802616894245, "learning_rate": 1.111111111111111e-06, "loss": -0.0026, "num_tokens": 44538107.0, "reward": 1.1360284090042114, "reward_std": 0.3467368483543396, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.6760109663009644, "rewards/format_reward_step": 0.9765625, "rewards/stepwise_brier_reward": 0.8212274312973022, "step": 160 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5623973727422004, "calib/avg_num_step_conf": 2.7265625, "calib/ece": 0.16181102362204752, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9645669291338582, "calib/gap": 0.0096551724137931, "calib/mean_conf": 0.9577165354330709, "calib/mu_c": 0.9596551724137932, "calib/mu_w": 0.9500000000000001, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1601574803149609, "calib/std_conf": 0.03413011177434172, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2082.0, "completions/max_terminated_length": 2082.0, "completions/mean_length": 647.796875, "completions/mean_terminated_length": 658.0794067382812, "completions/min_length": 0.0, "completions/min_terminated_length": 242.0, "epoch": 0.17173333333333332, "grad_norm": 0.010070580057799816, "learning_rate": 1.0833333333333335e-06, "loss": -0.0, "num_tokens": 44807863.0, "reward": 1.3078510761260986, "reward_std": 0.1625467985868454, "rewards/accuracy_reward_step": 0.796875, "rewards/final_brier_reward_step": 0.8060562610626221, "rewards/format_reward_step": 0.98828125, "rewards/stepwise_brier_reward": 0.8425354957580566, "step": 161 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6014705882352941, "calib/avg_num_step_conf": 2.7109375, "calib/ece": 0.1607874015748033, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9566929133858267, "calib/gap": 0.011984313725490359, "calib/mean_conf": 0.9584251968503937, "calib/mu_c": 0.9607843137254903, "calib/mu_w": 0.9488, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1580314960629923, "calib/std_conf": 0.02818457714319619, "calib/step_conf_rate": 0.984375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1618.0, "completions/max_terminated_length": 1618.0, "completions/mean_length": 634.78515625, "completions/mean_terminated_length": 647.4302978515625, "completions/min_length": 0.0, "completions/min_terminated_length": 296.0, "epoch": 0.1728, "grad_norm": 0.010716700926423073, "learning_rate": 1.0555555555555557e-06, "loss": -0.0156, "num_tokens": 45074512.0, "reward": 1.3133634328842163, "reward_std": 0.1944691240787506, "rewards/accuracy_reward_step": 0.796875, "rewards/final_brier_reward_step": 0.8023550510406494, "rewards/format_reward_step": 0.9765625, "rewards/stepwise_brier_reward": 0.8729736804962158, "step": 162 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5619202226345084, "calib/avg_num_step_conf": 2.70703125, "calib/ece": 0.34496031746031774, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9325396825396826, "calib/gap": 0.015111317254174472, "calib/mean_conf": 0.9560714285714287, "calib/mu_c": 0.961948051948052, "calib/mu_w": 0.9468367346938775, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34496031746031774, "calib/std_conf": 0.03660225480334978, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1350.0, "completions/max_terminated_length": 1350.0, "completions/mean_length": 688.7265625, "completions/mean_terminated_length": 696.893310546875, "completions/min_length": 0.0, "completions/min_terminated_length": 294.0, "epoch": 0.17386666666666667, "grad_norm": 0.008968646638095379, "learning_rate": 1.0277777777777777e-06, "loss": 0.0074, "num_tokens": 45355658.0, "reward": 1.062105655670166, "reward_std": 0.28513896465301514, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6390472650527954, "rewards/format_reward_step": 0.984375, "rewards/stepwise_brier_reward": 0.8093751668930054, "step": 163 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5284182110024807, "calib/avg_num_step_conf": 2.4921875, "calib/ece": 0.2602352941176473, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9686274509803922, "calib/gap": 0.010157595213775239, "calib/mean_conf": 0.9559215686274509, "calib/mu_c": 0.9589887640449437, "calib/mu_w": 0.9488311688311685, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.259058823529412, "calib/std_conf": 0.06242040991542749, "calib/step_conf_rate": 0.984375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2611.0, "completions/max_terminated_length": 2611.0, "completions/mean_length": 739.3046875, "completions/mean_terminated_length": 748.0711669921875, "completions/min_length": 0.0, "completions/min_terminated_length": 338.0, "epoch": 0.17493333333333333, "grad_norm": 0.012120206840336323, "learning_rate": 1.0000000000000002e-06, "loss": -0.0022, "num_tokens": 45651056.0, "reward": 1.179457426071167, "reward_std": 0.3007351756095886, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.7120116949081421, "rewards/format_reward_step": 0.98046875, "rewards/stepwise_brier_reward": 0.8323806524276733, "step": 164 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5171819110884006, "calib/avg_num_step_conf": 2.578125, "calib/ece": 0.36580392156862745, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9764705882352941, "calib/gap": 0.0008776188042923794, "calib/mean_conf": 0.9618823529411766, "calib/mu_c": 0.9622368421052631, "calib/mu_w": 0.9613592233009707, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.36580392156862745, "calib/std_conf": 0.016930824724041556, "calib/step_conf_rate": 0.9921875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1256.0, "completions/max_terminated_length": 1256.0, "completions/mean_length": 705.7890625, "completions/mean_terminated_length": 716.9921264648438, "completions/min_length": 0.0, "completions/min_terminated_length": 272.0, "epoch": 0.176, "grad_norm": 0.016318166628479958, "learning_rate": 9.722222222222224e-07, "loss": 0.015, "num_tokens": 45937314.0, "reward": 1.0466334819793701, "reward_std": 0.2265051007270813, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6153144240379333, "rewards/format_reward_step": 0.98828125, "rewards/stepwise_brier_reward": 0.8009068965911865, "step": 165 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6354635074368731, "calib/avg_num_step_conf": 2.484375, "calib/ece": 0.1870980392156864, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9450980392156862, "calib/gap": 0.03920269802836385, "calib/mean_conf": 0.9557254901960784, "calib/mu_c": 0.9647959183673468, "calib/mu_w": 0.925593220338983, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1870980392156864, "calib/std_conf": 0.06063199130704637, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1762.0, "completions/max_terminated_length": 1762.0, "completions/mean_length": 702.08984375, "completions/mean_terminated_length": 713.2341918945312, "completions/min_length": 0.0, "completions/min_terminated_length": 276.0, "epoch": 0.17706666666666668, "grad_norm": 0.00895922165364027, "learning_rate": 9.444444444444445e-07, "loss": -0.0221, "num_tokens": 46223233.0, "reward": 1.2920867204666138, "reward_std": 0.19075867533683777, "rewards/accuracy_reward_step": 0.765625, "rewards/final_brier_reward_step": 0.7900984287261963, "rewards/format_reward_step": 0.98828125, "rewards/stepwise_brier_reward": 0.9204358458518982, "step": 166 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.556592039800995, "calib/avg_num_step_conf": 2.77734375, "calib/ece": 0.16636363636363638, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9881422924901185, "calib/gap": 0.022588021431305227, "calib/mean_conf": 0.9608300395256918, "calib/mu_c": 0.9654726368159205, "calib/mu_w": 0.9428846153846153, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.16636363636363638, "calib/std_conf": 0.06275385416252457, "calib/step_conf_rate": 0.98828125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1608.0, "completions/max_terminated_length": 1608.0, "completions/mean_length": 707.19140625, "completions/mean_terminated_length": 718.4166870117188, "completions/min_length": 0.0, "completions/min_terminated_length": 282.0, "epoch": 0.17813333333333334, "grad_norm": 0.009750531055033207, "learning_rate": 9.166666666666666e-07, "loss": -0.0063, "num_tokens": 46509882.0, "reward": 1.2959802150726318, "reward_std": 0.2873036861419678, "rewards/accuracy_reward_step": 0.79296875, "rewards/final_brier_reward_step": 0.7948437333106995, "rewards/format_reward_step": 0.9765625, "rewards/stepwise_brier_reward": 0.8265774250030518, "step": 167 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5303853397073736, "calib/avg_num_step_conf": 2.59765625, "calib/ece": 0.26858823529411757, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.984313725490196, "calib/gap": 0.0007583659278577182, "calib/mean_conf": 0.9627058823529412, "calib/mu_c": 0.9629378531073448, "calib/mu_w": 0.9621794871794871, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.26858823529411757, "calib/std_conf": 0.01582167056652271, "calib/step_conf_rate": 0.9921875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2580.0, "completions/max_terminated_length": 2580.0, "completions/mean_length": 752.1953125, "completions/mean_terminated_length": 761.1146850585938, "completions/min_length": 0.0, "completions/min_terminated_length": 269.0, "epoch": 0.1792, "grad_norm": 0.015377267263829708, "learning_rate": 8.88888888888889e-07, "loss": 0.0179, "num_tokens": 46807116.0, "reward": 1.1735761165618896, "reward_std": 0.3184884786605835, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.705013632774353, "rewards/format_reward_step": 0.98828125, "rewards/stepwise_brier_reward": 0.8283531665802002, "step": 168 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5650406504065042, "calib/avg_num_step_conf": 2.75390625, "calib/ece": 0.31622047244094503, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9645669291338582, "calib/gap": 0.006368563685636874, "calib/mean_conf": 0.9618897637795276, "calib/mu_c": 0.9641463414634147, "calib/mu_w": 0.9577777777777778, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.31622047244094503, "calib/std_conf": 0.027009251359475295, "calib/step_conf_rate": 0.9921875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1396.0, "completions/max_terminated_length": 1396.0, "completions/mean_length": 697.703125, "completions/mean_terminated_length": 708.77783203125, "completions/min_length": 0.0, "completions/min_terminated_length": 225.0, "epoch": 0.18026666666666666, "grad_norm": 0.011332076974213123, "learning_rate": 8.611111111111112e-07, "loss": 0.0092, "num_tokens": 47089912.0, "reward": 1.1101481914520264, "reward_std": 0.23798193037509918, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.664088249206543, "rewards/format_reward_step": 0.984375, "rewards/stepwise_brier_reward": 0.8202548027038574, "step": 169 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6006088506088506, "calib/avg_num_step_conf": 2.75390625, "calib/ece": 0.25687499999999996, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.98828125, "calib/gap": 0.0020255420255419487, "calib/mean_conf": 0.9626562500000002, "calib/mu_c": 0.9632417582417581, "calib/mu_w": 0.9612162162162161, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.254296875, "calib/std_conf": 0.04076388519188891, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1396.0, "completions/max_terminated_length": 1396.0, "completions/mean_length": 721.38671875, "completions/mean_terminated_length": 732.8373413085938, "completions/min_length": 0.0, "completions/min_terminated_length": 268.0, "epoch": 0.18133333333333335, "grad_norm": 0.01321347989141941, "learning_rate": 8.333333333333333e-07, "loss": -0.0073, "num_tokens": 47378739.0, "reward": 1.2024147510528564, "reward_std": 0.20902082324028015, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.7299222946166992, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.8375494480133057, "step": 170 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5430603613994618, "calib/avg_num_step_conf": 2.9296875, "calib/ece": 0.36250980392156873, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9803921568627451, "calib/gap": 0.010457516339869355, "calib/mean_conf": 0.9600000000000001, "calib/mu_c": 0.9641830065359476, "calib/mu_w": 0.9537254901960782, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3612549019607844, "calib/std_conf": 0.04710438970427028, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1339.0, "completions/max_terminated_length": 1339.0, "completions/mean_length": 675.34765625, "completions/mean_terminated_length": 686.0675048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 292.0, "epoch": 0.1824, "grad_norm": 0.012451022863388062, "learning_rate": 8.055555555555557e-07, "loss": -0.0125, "num_tokens": 47658524.0, "reward": 1.0445964336395264, "reward_std": 0.23366421461105347, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6304210424423218, "rewards/format_reward_step": 0.9921875, "rewards/stepwise_brier_reward": 0.7604649066925049, "step": 171 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5827322308233638, "calib/avg_num_step_conf": 2.84375, "calib/ece": 0.19255905511811025, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.984251968503937, "calib/gap": 0.015737156931738272, "calib/mean_conf": 0.9642125984251968, "calib/mu_c": 0.9678061224489797, "calib/mu_w": 0.9520689655172414, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.19255905511811025, "calib/std_conf": 0.028491780807158175, "calib/step_conf_rate": 0.98828125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1975.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 705.26953125, "completions/mean_terminated_length": 713.6324462890625, "completions/min_length": 0.0, "completions/min_terminated_length": 376.0, "epoch": 0.18346666666666667, "grad_norm": 0.01179542113095522, "learning_rate": 7.777777777777779e-07, "loss": -0.0062, "num_tokens": 47942425.0, "reward": 1.2731413841247559, "reward_std": 0.28811055421829224, "rewards/accuracy_reward_step": 0.765625, "rewards/final_brier_reward_step": 0.7803664207458496, "rewards/format_reward_step": 0.96875, "rewards/stepwise_brier_reward": 0.8621994256973267, "step": 172 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5632071632071632, "calib/avg_num_step_conf": 2.80859375, "calib/ece": 0.22102362204724413, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.984251968503937, "calib/gap": 0.0066715506715505635, "calib/mean_conf": 0.9651181102362205, "calib/mu_c": 0.9668253968253968, "calib/mu_w": 0.9601538461538462, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22102362204724413, "calib/std_conf": 0.017961732801923386, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1519.0, "completions/max_terminated_length": 1519.0, "completions/mean_length": 685.6796875, "completions/mean_terminated_length": 696.5635375976562, "completions/min_length": 0.0, "completions/min_terminated_length": 249.0, "epoch": 0.18453333333333333, "grad_norm": 0.018552782014012337, "learning_rate": 7.5e-07, "loss": -0.0272, "num_tokens": 48221119.0, "reward": 1.2369461059570312, "reward_std": 0.22380010783672333, "rewards/accuracy_reward_step": 0.73828125, "rewards/final_brier_reward_step": 0.7569882869720459, "rewards/format_reward_step": 0.9921875, "rewards/stepwise_brier_reward": 0.8407962918281555, "step": 173 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5303921568627452, "calib/avg_num_step_conf": 2.8515625, "calib/ece": 0.2878884462151396, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9800796812749004, "calib/gap": 0.006553376906318076, "calib/mean_conf": 0.9651792828685258, "calib/mu_c": 0.9672941176470589, "calib/mu_w": 0.9607407407407408, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2878884462151396, "calib/std_conf": 0.027662373388372775, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2801.0, "completions/max_terminated_length": 2801.0, "completions/mean_length": 779.80859375, "completions/mean_terminated_length": 789.0553588867188, "completions/min_length": 0.0, "completions/min_terminated_length": 326.0, "epoch": 0.1856, "grad_norm": 0.012799207121133804, "learning_rate": 7.222222222222222e-07, "loss": 0.0003, "num_tokens": 48524982.0, "reward": 1.1339629888534546, "reward_std": 0.34867727756500244, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.6868125200271606, "rewards/format_reward_step": 0.9765625, "rewards/stepwise_brier_reward": 0.8021644949913025, "step": 174 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5194558683960772, "calib/avg_num_step_conf": 2.97265625, "calib/ece": 0.38657480314960646, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9488188976377953, "calib/gap": 0.023962670041125955, "calib/mean_conf": 0.9574409448818898, "calib/mu_c": 0.9677241379310344, "calib/mu_w": 0.9437614678899084, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38657480314960646, "calib/std_conf": 0.06315325111715353, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2540.0, "completions/max_terminated_length": 2540.0, "completions/mean_length": 767.140625, "completions/mean_terminated_length": 776.2371826171875, "completions/min_length": 0.0, "completions/min_terminated_length": 235.0, "epoch": 0.18666666666666668, "grad_norm": 0.01159517653286457, "learning_rate": 6.944444444444446e-07, "loss": -0.0186, "num_tokens": 48827194.0, "reward": 1.015672206878662, "reward_std": 0.30184462666511536, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6085425615310669, "rewards/format_reward_step": 0.9921875, "rewards/stepwise_brier_reward": 0.791646420955658, "step": 175 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5602439720359539, "calib/avg_num_step_conf": 3.07421875, "calib/ece": 0.3039357429718878, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9598393574297188, "calib/gap": 0.03402767869881573, "calib/mean_conf": 0.9585542168674699, "calib/mu_c": 0.9703067484662574, "calib/mu_w": 0.9362790697674417, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3039357429718878, "calib/std_conf": 0.09441171729047246, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2255.0, "completions/max_terminated_length": 2255.0, "completions/mean_length": 752.1484375, "completions/mean_terminated_length": 764.0873413085938, "completions/min_length": 0.0, "completions/min_terminated_length": 298.0, "epoch": 0.18773333333333334, "grad_norm": 0.009078577160835266, "learning_rate": 6.666666666666667e-07, "loss": -0.0389, "num_tokens": 49123808.0, "reward": 1.1046640872955322, "reward_std": 0.2785027027130127, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6649796962738037, "rewards/format_reward_step": 0.96484375, "rewards/stepwise_brier_reward": 0.8208639621734619, "step": 176 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4949391171993912, "calib/avg_num_step_conf": 3.30078125, "calib/ece": 0.2566403162055337, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9762845849802372, "calib/gap": -0.002200152207001538, "calib/mean_conf": 0.9680237154150197, "calib/mu_c": 0.9673888888888889, "calib/mu_w": 0.9695890410958904, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25660079051383405, "calib/std_conf": 0.023692017443508632, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1698.0, "completions/max_terminated_length": 1698.0, "completions/mean_length": 753.6171875, "completions/mean_terminated_length": 765.5794067382812, "completions/min_length": 0.0, "completions/min_terminated_length": 278.0, "epoch": 0.1888, "grad_norm": 0.016456032171845436, "learning_rate": 6.388888888888889e-07, "loss": -0.0347, "num_tokens": 49420566.0, "reward": 1.1904031038284302, "reward_std": 0.2102235108613968, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7189035415649414, "rewards/format_reward_step": 0.98828125, "rewards/stepwise_brier_reward": 0.8348963856697083, "step": 177 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5078804347826087, "calib/avg_num_step_conf": 3.4609375, "calib/ece": 0.2466929133858268, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9960629921259843, "calib/gap": 0.007635093167701945, "calib/mean_conf": 0.9711023622047245, "calib/mu_c": 0.9732065217391306, "calib/mu_w": 0.9655714285714286, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2466929133858268, "calib/std_conf": 0.033934358739799476, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2121.0, "completions/max_terminated_length": 2121.0, "completions/mean_length": 753.9296875, "completions/mean_terminated_length": 762.8695678710938, "completions/min_length": 0.0, "completions/min_terminated_length": 255.0, "epoch": 0.18986666666666666, "grad_norm": 0.01947125233709812, "learning_rate": 6.111111111111112e-07, "loss": 0.0028, "num_tokens": 49719644.0, "reward": 1.2093262672424316, "reward_std": 0.27125757932662964, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.7354522943496704, "rewards/format_reward_step": 0.98828125, "rewards/stepwise_brier_reward": 0.8315401673316956, "step": 178 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.573144418872267, "calib/avg_num_step_conf": 3.53125, "calib/ece": 0.28070588235294125, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.984313725490196, "calib/gap": 0.005341628308400792, "calib/mean_conf": 0.9709019607843138, "calib/mu_c": 0.9725568181818183, "calib/mu_w": 0.9672151898734175, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28070588235294125, "calib/std_conf": 0.014778435404942907, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2102.0, "completions/max_terminated_length": 2102.0, "completions/mean_length": 782.10546875, "completions/mean_terminated_length": 794.5198974609375, "completions/min_length": 0.0, "completions/min_terminated_length": 384.0, "epoch": 0.19093333333333334, "grad_norm": 0.01893046498298645, "learning_rate": 5.833333333333334e-07, "loss": -0.0268, "num_tokens": 50026127.0, "reward": 1.1743907928466797, "reward_std": 0.25549614429473877, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.7066734433174133, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.8424519300460815, "step": 179 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5471780534684606, "calib/avg_num_step_conf": 3.48046875, "calib/ece": 0.20573122529644283, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9920948616600791, "calib/gap": 0.004183120740870572, "calib/mean_conf": 0.9725296442687748, "calib/mu_c": 0.9735051546391755, "calib/mu_w": 0.969322033898305, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20573122529644283, "calib/std_conf": 0.012316804239407004, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2368.0, "completions/max_terminated_length": 2368.0, "completions/mean_length": 831.94140625, "completions/mean_terminated_length": 841.8063354492188, "completions/min_length": 0.0, "completions/min_terminated_length": 323.0, "epoch": 0.192, "grad_norm": 0.009832249954342842, "learning_rate": 5.555555555555555e-07, "loss": 0.0098, "num_tokens": 50342960.0, "reward": 1.2661899328231812, "reward_std": 0.2089243233203888, "rewards/accuracy_reward_step": 0.7578125, "rewards/final_brier_reward_step": 0.7710573673248291, "rewards/format_reward_step": 0.98828125, "rewards/stepwise_brier_reward": 0.8671398162841797, "step": 180 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5299408014571949, "calib/avg_num_step_conf": 3.78515625, "calib/ece": 0.2548235294117647, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9882352941176471, "calib/gap": 0.0021994535519124225, "calib/mean_conf": 0.9707450980392156, "calib/mu_c": 0.9713661202185792, "calib/mu_w": 0.9691666666666667, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2539607843137255, "calib/std_conf": 0.017417416500141687, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1615.0, "completions/max_terminated_length": 1615.0, "completions/mean_length": 766.52734375, "completions/mean_terminated_length": 778.6945190429688, "completions/min_length": 0.0, "completions/min_terminated_length": 415.0, "epoch": 0.19306666666666666, "grad_norm": 0.01908833347260952, "learning_rate": 5.277777777777779e-07, "loss": -0.0177, "num_tokens": 50645455.0, "reward": 1.206012487411499, "reward_std": 0.2718864381313324, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.7310327887535095, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.8352043628692627, "step": 181 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5840121811952798, "calib/avg_num_step_conf": 3.91796875, "calib/ece": 0.2506640625000003, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0032303007232585834, "calib/mean_conf": 0.9733203125000003, "calib/mu_c": 0.9742162162162163, "calib/mu_w": 0.9709859154929578, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2506640625000003, "calib/std_conf": 0.00898734665528953, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1403.0, "completions/max_terminated_length": 1403.0, "completions/mean_length": 803.19140625, "completions/mean_terminated_length": 815.9405517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 389.0, "epoch": 0.19413333333333332, "grad_norm": 0.009503989480435848, "learning_rate": 5.000000000000001e-07, "loss": 0.019, "num_tokens": 50957232.0, "reward": 1.2067129611968994, "reward_std": 0.20590227842330933, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7374964952468872, "rewards/format_reward_step": 0.9921875, "rewards/stepwise_brier_reward": 0.8018555045127869, "step": 182 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6651111111111112, "calib/avg_num_step_conf": 3.953125, "calib/ece": 0.2600392156862745, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9764705882352941, "calib/gap": 0.03275555555555598, "calib/mean_conf": 0.9659215686274512, "calib/mu_c": 0.9755555555555557, "calib/mu_w": 0.9427999999999997, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2600392156862745, "calib/std_conf": 0.06917322476934545, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2410.0, "completions/max_terminated_length": 2410.0, "completions/mean_length": 852.046875, "completions/mean_terminated_length": 865.5714721679688, "completions/min_length": 0.0, "completions/min_terminated_length": 441.0, "epoch": 0.1952, "grad_norm": 0.013012172654271126, "learning_rate": 4.7222222222222226e-07, "loss": -0.0007, "num_tokens": 51282036.0, "reward": 1.2002928256988525, "reward_std": 0.2875397801399231, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7307175397872925, "rewards/format_reward_step": 0.99609375, "rewards/stepwise_brier_reward": 0.8595161437988281, "step": 183 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5417582417582417, "calib/avg_num_step_conf": 4.26171875, "calib/ece": 0.250595238095238, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0017252747252752831, "calib/mean_conf": 0.9728174603174602, "calib/mu_c": 0.9732967032967035, "calib/mu_w": 0.9715714285714282, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.250595238095238, "calib/std_conf": 0.009449028499362185, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1609.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 827.23828125, "completions/mean_terminated_length": 843.7171630859375, "completions/min_length": 0.0, "completions/min_terminated_length": 278.0, "epoch": 0.19626666666666667, "grad_norm": 0.008613526821136475, "learning_rate": 4.444444444444445e-07, "loss": -0.0042, "num_tokens": 51599089.0, "reward": 1.192449688911438, "reward_std": 0.2646951675415039, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.7253628969192505, "rewards/format_reward_step": 0.98046875, "rewards/stepwise_brier_reward": 0.808498203754425, "step": 184 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6991762907608695, "calib/avg_num_step_conf": 4.16015625, "calib/ece": 0.2270161290322583, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9758064516129032, "calib/gap": 0.02006793478260871, "calib/mean_conf": 0.9689516129032261, "calib/mu_c": 0.9741304347826087, "calib/mu_w": 0.9540625, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2270161290322583, "calib/std_conf": 0.03065990731808206, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2234.0, "completions/max_terminated_length": 2234.0, "completions/mean_length": 832.94140625, "completions/mean_terminated_length": 859.8104858398438, "completions/min_length": 0.0, "completions/min_terminated_length": 291.0, "epoch": 0.19733333333333333, "grad_norm": 0.01702149398624897, "learning_rate": 4.1666666666666667e-07, "loss": -0.033, "num_tokens": 51919242.0, "reward": 1.2030997276306152, "reward_std": 0.28247952461242676, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.7397195100784302, "rewards/format_reward_step": 0.96484375, "rewards/stepwise_brier_reward": 0.8117418885231018, "step": 185 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6043947858472998, "calib/avg_num_step_conf": 4.71484375, "calib/ece": 0.2687401574803152, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9921259842519685, "calib/gap": 0.007754189944134127, "calib/mean_conf": 0.9734645669291341, "calib/mu_c": 0.9757541899441342, "calib/mu_w": 0.9680000000000001, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2687401574803152, "calib/std_conf": 0.017132092312883856, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1892.0, "completions/max_terminated_length": 1892.0, "completions/mean_length": 869.1796875, "completions/mean_terminated_length": 882.9762573242188, "completions/min_length": 0.0, "completions/min_terminated_length": 246.0, "epoch": 0.1984, "grad_norm": 0.01188287977129221, "learning_rate": 3.8888888888888895e-07, "loss": -0.001, "num_tokens": 52246792.0, "reward": 1.184190034866333, "reward_std": 0.22493651509284973, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.7166726589202881, "rewards/format_reward_step": 0.98828125, "rewards/stepwise_brier_reward": 0.8278998732566833, "step": 186 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5337937114310902, "calib/avg_num_step_conf": 5.015625, "calib/ece": 0.310769230769231, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9959514170040485, "calib/gap": 0.002234792829855947, "calib/mean_conf": 0.9747368421052635, "calib/mu_c": 0.9754878048780489, "calib/mu_w": 0.9732530120481929, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.310769230769231, "calib/std_conf": 0.01036927389163999, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2553.0, "completions/max_terminated_length": 2553.0, "completions/mean_length": 889.37109375, "completions/mean_terminated_length": 907.087646484375, "completions/min_length": 0.0, "completions/min_terminated_length": 324.0, "epoch": 0.19946666666666665, "grad_norm": 0.011365193873643875, "learning_rate": 3.611111111111111e-07, "loss": -0.0172, "num_tokens": 52576015.0, "reward": 1.093395709991455, "reward_std": 0.391842782497406, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6570945382118225, "rewards/format_reward_step": 0.9609375, "rewards/stepwise_brier_reward": 0.7539879083633423, "step": 187 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5803228900255755, "calib/avg_num_step_conf": 6.0078125, "calib/ece": 0.24523809523809537, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0029603580562662657, "calib/mean_conf": 0.9753968253968255, "calib/mu_c": 0.9761956521739132, "calib/mu_w": 0.973235294117647, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.24523809523809537, "calib/std_conf": 0.008227981068377426, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2275.0, "completions/max_terminated_length": 2275.0, "completions/mean_length": 890.671875, "completions/mean_terminated_length": 908.4143676757812, "completions/min_length": 0.0, "completions/min_terminated_length": 423.0, "epoch": 0.20053333333333334, "grad_norm": 0.013773644343018532, "learning_rate": 3.3333333333333335e-07, "loss": -0.0265, "num_tokens": 52908099.0, "reward": 1.1965887546539307, "reward_std": 0.30136632919311523, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.7273281216621399, "rewards/format_reward_step": 0.96484375, "rewards/stepwise_brier_reward": 0.7980896234512329, "step": 188 }, { "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.573721716132824, "calib/avg_num_step_conf": 7.5234375, "calib/ece": 0.30983805668016223, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.9959514170040485, "calib/gap": 0.0055502497796063865, "calib/mean_conf": 0.9738056680161945, "calib/mu_c": 0.9756707317073173, "calib/mu_w": 0.9701204819277109, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.30983805668016223, "calib/std_conf": 0.018222941458610812, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2010.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 868.1953125, "completions/mean_terminated_length": 896.2015991210938, "completions/min_length": 0.0, "completions/min_terminated_length": 490.0, "epoch": 0.2016, "grad_norm": 0.015437912195920944, "learning_rate": 3.055555555555556e-07, "loss": -0.0532, "num_tokens": 53238125.0, "reward": 1.0748528242111206, "reward_std": 0.3838985860347748, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6528773307800293, "rewards/format_reward_step": 0.9140625, "rewards/stepwise_brier_reward": 0.7184088230133057, "step": 189 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.4980392156862745, "calib/avg_num_step_conf": 6.46875, "calib/ece": 0.2963745019920321, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9760956175298805, "calib/gap": -0.0028169934640523975, "calib/mean_conf": 0.9692031872509962, "calib/mu_c": 0.9682941176470589, "calib/mu_w": 0.9711111111111113, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.29414342629482093, "calib/std_conf": 0.0385618268736514, "calib/step_conf_rate": 0.9921875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1613.0, "completions/max_terminated_length": 1613.0, "completions/mean_length": 963.625, "completions/mean_terminated_length": 982.8207397460938, "completions/min_length": 0.0, "completions/min_terminated_length": 465.0, "epoch": 0.20266666666666666, "grad_norm": 0.013317610137164593, "learning_rate": 2.7777777777777776e-07, "loss": -0.0263, "num_tokens": 53590421.0, "reward": 1.116121530532837, "reward_std": 0.28696075081825256, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.6715328097343445, "rewards/format_reward_step": 0.9609375, "rewards/stepwise_brier_reward": 0.7523280382156372, "step": 190 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5665102246061013, "calib/avg_num_step_conf": 6.9375, "calib/ece": 0.3537698412698414, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9880952380952381, "calib/gap": 0.0007274555816290107, "calib/mean_conf": 0.9736111111111112, "calib/mu_c": 0.9738853503184713, "calib/mu_w": 0.9731578947368423, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3521825396825398, "calib/std_conf": 0.02835530582812857, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 943.1328125, "completions/mean_terminated_length": 958.1032104492188, "completions/min_length": 0.0, "completions/min_terminated_length": 359.0, "epoch": 0.20373333333333332, "grad_norm": 0.011955502443015575, "learning_rate": 2.5000000000000004e-07, "loss": -0.0078, "num_tokens": 53936031.0, "reward": 1.0461554527282715, "reward_std": 0.3568416237831116, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6269733905792236, "rewards/format_reward_step": 0.9609375, "rewards/stepwise_brier_reward": 0.7201483845710754, "step": 191 }, { "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5893647885173309, "calib/avg_num_step_conf": 7.51953125, "calib/ece": 0.27015936254980094, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9920318725099602, "calib/gap": 0.003929607573675331, "calib/mean_conf": 0.9753386454183268, "calib/mu_c": 0.9764971751412431, "calib/mu_w": 0.9725675675675678, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27015936254980094, "calib/std_conf": 0.012567937315419416, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1946.0, "completions/max_terminated_length": 1946.0, "completions/mean_length": 961.21875, "completions/mean_terminated_length": 980.3665771484375, "completions/min_length": 0.0, "completions/min_terminated_length": 431.0, "epoch": 0.2048, "grad_norm": 0.011386622674763203, "learning_rate": 2.2222222222222224e-07, "loss": -0.0148, "num_tokens": 54287079.0, "reward": 1.1587059497833252, "reward_std": 0.34305843710899353, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.7046706676483154, "rewards/format_reward_step": 0.94921875, "rewards/stepwise_brier_reward": 0.7848403453826904, "step": 192 }, { "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.606986215538847, "calib/avg_num_step_conf": 8.3515625, "calib/ece": 0.26475409836065594, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.9713114754098361, "calib/gap": 0.07432330827067657, "calib/mean_conf": 0.9532786885245904, "calib/mu_c": 0.9764285714285715, "calib/mu_w": 0.902105263157895, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.26475409836065594, "calib/std_conf": 0.13542248779858804, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2680.0, "completions/max_terminated_length": 2680.0, "completions/mean_length": 942.484375, "completions/mean_terminated_length": 984.7999877929688, "completions/min_length": 0.0, "completions/min_terminated_length": 433.0, "epoch": 0.20586666666666667, "grad_norm": 0.012017766013741493, "learning_rate": 1.9444444444444447e-07, "loss": -0.0236, "num_tokens": 54634067.0, "reward": 1.1044975519180298, "reward_std": 0.42620962858200073, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6860804557800293, "rewards/format_reward_step": 0.92578125, "rewards/stepwise_brier_reward": 0.7365971803665161, "step": 193 }, { "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.542334714465862, "calib/avg_num_step_conf": 9.66015625, "calib/ece": 0.22716049382716064, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0024959466762748406, "calib/mean_conf": 0.9761316872427985, "calib/mu_c": 0.976758241758242, "calib/mu_w": 0.9742622950819672, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22716049382716064, "calib/std_conf": 0.008404336037510705, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2678.0, "completions/max_terminated_length": 2678.0, "completions/mean_length": 921.1953125, "completions/mean_terminated_length": 962.5550537109375, "completions/min_length": 0.0, "completions/min_terminated_length": 407.0, "epoch": 0.20693333333333333, "grad_norm": 0.020082490518689156, "learning_rate": 1.6666666666666668e-07, "loss": 0.0045, "num_tokens": 54975837.0, "reward": 1.1723902225494385, "reward_std": 0.3351433575153351, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.7213640213012695, "rewards/format_reward_step": 0.92578125, "rewards/stepwise_brier_reward": 0.754134476184845, "step": 194 }, { "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.6122259956776783, "calib/avg_num_step_conf": 8.9453125, "calib/ece": 0.30074074074074103, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.9958847736625515, "calib/gap": 0.005727848101265809, "calib/mean_conf": 0.9756378600823047, "calib/mu_c": 0.9775000000000001, "calib/mu_w": 0.9717721518987343, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.30074074074074103, "calib/std_conf": 0.010844226562634146, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2545.0, "completions/max_terminated_length": 2545.0, "completions/mean_length": 958.8515625, "completions/mean_terminated_length": 997.8292236328125, "completions/min_length": 0.0, "completions/min_terminated_length": 413.0, "epoch": 0.208, "grad_norm": 0.025525391101837158, "learning_rate": 1.3888888888888888e-07, "loss": -0.0445, "num_tokens": 55327287.0, "reward": 1.0741482973098755, "reward_std": 0.316531240940094, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6522351503372192, "rewards/format_reward_step": 0.921875, "rewards/stepwise_brier_reward": 0.713107705116272, "step": 195 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6035727174305305, "calib/avg_num_step_conf": 9.08984375, "calib/ece": 0.31584677419354856, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9879032258064516, "calib/gap": 0.014747744496571902, "calib/mean_conf": 0.9731048387096776, "calib/mu_c": 0.9781595092024543, "calib/mu_w": 0.9634117647058824, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31584677419354856, "calib/std_conf": 0.036550300856708705, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2772.0, "completions/max_terminated_length": 2772.0, "completions/mean_length": 956.91015625, "completions/mean_terminated_length": 983.8112182617188, "completions/min_length": 0.0, "completions/min_terminated_length": 432.0, "epoch": 0.20906666666666668, "grad_norm": 0.02355959638953209, "learning_rate": 1.1111111111111112e-07, "loss": -0.0291, "num_tokens": 55674800.0, "reward": 1.0759737491607666, "reward_std": 0.3116154372692108, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6548066139221191, "rewards/format_reward_step": 0.953125, "rewards/stepwise_brier_reward": 0.7209635972976685, "step": 196 }, { "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5651896180742335, "calib/avg_num_step_conf": 7.94140625, "calib/ece": 0.3963967611336035, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.9959514170040485, "calib/gap": 0.005576923076923035, "calib/mean_conf": 0.9753441295546562, "calib/mu_c": 0.9776923076923079, "calib/mu_w": 0.9721153846153848, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3963967611336035, "calib/std_conf": 0.0251429716668958, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2465.0, "completions/max_terminated_length": 2465.0, "completions/mean_length": 956.3359375, "completions/mean_terminated_length": 983.2208251953125, "completions/min_length": 0.0, "completions/min_terminated_length": 419.0, "epoch": 0.21013333333333334, "grad_norm": 0.024164782837033272, "learning_rate": 8.333333333333334e-08, "loss": -0.0418, "num_tokens": 56024678.0, "reward": 0.9724588394165039, "reward_std": 0.4731922745704651, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.5791285037994385, "rewards/format_reward_step": 0.9453125, "rewards/stepwise_brier_reward": 0.6825816035270691, "step": 197 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5789682539682539, "calib/avg_num_step_conf": 7.6171875, "calib/ece": 0.30253012048192796, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9759036144578314, "calib/gap": 0.03386796536796555, "calib/mean_conf": 0.9651807228915663, "calib/mu_c": 0.9766060606060609, "calib/mu_w": 0.9427380952380954, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30253012048192796, "calib/std_conf": 0.072072499962767, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2344.0, "completions/max_terminated_length": 2344.0, "completions/mean_length": 984.859375, "completions/mean_terminated_length": 996.53759765625, "completions/min_length": 0.0, "completions/min_terminated_length": 326.0, "epoch": 0.2112, "grad_norm": 0.012946662493050098, "learning_rate": 5.555555555555556e-08, "loss": 0.0065, "num_tokens": 56382186.0, "reward": 1.102027416229248, "reward_std": 0.360604465007782, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6758777499198914, "rewards/format_reward_step": 0.97265625, "rewards/stepwise_brier_reward": 0.7650444507598877, "step": 198 }, { "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.575, "calib/avg_num_step_conf": 7.6796875, "calib/ece": 0.24390243902439038, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.9959349593495935, "calib/gap": 0.00559595959595971, "calib/mean_conf": 0.9756097560975612, "calib/mu_c": 0.9771111111111113, "calib/mu_w": 0.9715151515151516, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.24390243902439038, "calib/std_conf": 0.013889596121857412, "calib/step_conf_rate": 0.99609375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2553.0, "completions/max_terminated_length": 2553.0, "completions/mean_length": 986.60546875, "completions/mean_terminated_length": 1014.34130859375, "completions/min_length": 0.0, "completions/min_terminated_length": 398.0, "epoch": 0.21226666666666666, "grad_norm": 0.019104544073343277, "learning_rate": 2.777777777777778e-08, "loss": -0.0177, "num_tokens": 56738957.0, "reward": 1.1704182624816895, "reward_std": 0.37890321016311646, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7120695114135742, "rewards/format_reward_step": 0.93359375, "rewards/stepwise_brier_reward": 0.7836658954620361, "step": 199 }, { "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5893601190476191, "calib/avg_num_step_conf": 7.0390625, "calib/ece": 0.29798387096774226, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9919354838709677, "calib/gap": 0.005761904761904746, "calib/mean_conf": 0.9754032258064518, "calib/mu_c": 0.9772619047619048, "calib/mu_w": 0.9715, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.29798387096774226, "calib/std_conf": 0.012108327815904167, "calib/step_conf_rate": 1.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2662.0, "completions/max_terminated_length": 2662.0, "completions/mean_length": 975.93359375, "completions/mean_terminated_length": 999.3560180664062, "completions/min_length": 0.0, "completions/min_terminated_length": 411.0, "epoch": 0.21333333333333335, "grad_norm": 0.034536611288785934, "learning_rate": 0.0, "loss": 0.0016, "num_tokens": 57096844.0, "reward": 1.1061069965362549, "reward_std": 0.3216761648654938, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6724125146865845, "rewards/format_reward_step": 0.953125, "rewards/stepwise_brier_reward": 0.745764970779419, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": -0.023940378739498554, "train_runtime": 14098.1282, "train_samples_per_second": 3.632, "train_steps_per_second": 0.014 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 57096844, "num_train_epochs": 1, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }