Files
PureRL-7B-v6-fmt01-brierH-mid/trainer_state.json
ModelHub XC df336a5c12 初始化项目,由ModelHub XC社区提供模型
Model: zhaohq/PureRL-7B-v6-fmt01-brierH-mid
Source: Original Platform
2026-06-03 22:33:23 +08:00

8442 lines
329 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21333333333333335,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"calib/answer_extract_rate": 0.03125,
"calib/avg_num_step_conf": 0.0703125,
"calib/ece": 0.48125000000000007,
"calib/final_conf_rate": 0.03125,
"calib/format_rate": 0.01171875,
"calib/frac_conf_gt_0.9": 0.25,
"calib/gap": -0.008333333333333304,
"calib/mean_conf": 0.73125,
"calib/mu_c": 0.7250000000000001,
"calib/mu_w": 0.7333333333333334,
"calib/nonempty_final_conf_rate": 0.03125,
"calib/nonempty_reasoning_rate": 0.03515625,
"calib/nonempty_step_conf_rate": 0.015625,
"calib/pce": 0.48125000000000007,
"calib/std_conf": 0.19990231989649343,
"calib/step_conf_rate": 0.015625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1015625,
"completions/max_length": 3065.0,
"completions/max_terminated_length": 3065.0,
"completions/mean_length": 655.78515625,
"completions/mean_terminated_length": 729.9173583984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0010666666666666667,
"grad_norm": 0.17658577859401703,
"learning_rate": 0.0,
"loss": 0.0118,
"num_tokens": 298505.0,
"reward": 0.01619849167764187,
"reward_std": 0.03863148391246796,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.009296875447034836,
"rewards/format_reward_step": 0.01171875,
"rewards/stepwise_brier_reward": 0.010262716561555862,
"step": 1
},
{
"calib/answer_extract_rate": 0.03515625,
"calib/avg_num_step_conf": 0.11328125,
"calib/ece": 0.6925,
"calib/final_conf_rate": 0.015625,
"calib/format_rate": 0.015625,
"calib/frac_conf_gt_0.9": 0.25,
"calib/gap": -0.1233333333333333,
"calib/mean_conf": 0.7925,
"calib/mu_c": 0.7,
"calib/mu_w": 0.8233333333333333,
"calib/nonempty_final_conf_rate": 0.015625,
"calib/nonempty_reasoning_rate": 0.04296875,
"calib/nonempty_step_conf_rate": 0.02734375,
"calib/pce": 0.6175,
"calib/std_conf": 0.14889173919328097,
"calib/step_conf_rate": 0.02734375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09765625,
"completions/max_length": 2953.0,
"completions/max_terminated_length": 2953.0,
"completions/mean_length": 687.84375,
"completions/mean_terminated_length": 762.2857055664062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0021333333333333334,
"grad_norm": 0.30427029728889465,
"learning_rate": 2.5000000000000004e-07,
"loss": 0.0042,
"num_tokens": 600921.0,
"reward": 0.019744617864489555,
"reward_std": 0.055846214294433594,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.007027734536677599,
"rewards/format_reward_step": 0.015625,
"rewards/stepwise_brier_reward": 0.011798003688454628,
"step": 2
},
{
"calib/answer_extract_rate": 0.015625,
"calib/avg_num_step_conf": 0.0390625,
"calib/ece": 0.6499999999999999,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.65,
"calib/mu_c": NaN,
"calib/mu_w": 0.65,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.01953125,
"calib/nonempty_step_conf_rate": 0.01171875,
"calib/pce": 0.6499999999999999,
"calib/std_conf": 0.14719601443879743,
"calib/step_conf_rate": 0.01171875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2880.0,
"completions/max_terminated_length": 2880.0,
"completions/mean_length": 740.953125,
"completions/mean_terminated_length": 803.7457885742188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0032,
"grad_norm": 0.2135412096977234,
"learning_rate": 5.000000000000001e-07,
"loss": 0.0033,
"num_tokens": 918901.0,
"reward": 0.005034895613789558,
"reward_std": 0.01424083486199379,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0054296874441206455,
"rewards/format_reward_step": 0.0078125,
"rewards/stepwise_brier_reward": 0.006155208684504032,
"step": 3
},
{
"calib/answer_extract_rate": 0.015625,
"calib/avg_num_step_conf": 0.04296875,
"calib/ece": 0.48333333333333334,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.125,
"calib/mean_conf": 0.8166666666666668,
"calib/mu_c": 0.9,
"calib/mu_w": 0.775,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.01953125,
"calib/nonempty_step_conf_rate": 0.01171875,
"calib/pce": 0.48333333333333334,
"calib/std_conf": 0.11785113019775792,
"calib/step_conf_rate": 0.01171875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.140625,
"completions/max_length": 3011.0,
"completions/max_terminated_length": 3011.0,
"completions/mean_length": 700.9375,
"completions/mean_terminated_length": 815.6363525390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.004266666666666667,
"grad_norm": 0.1290050894021988,
"learning_rate": 7.5e-07,
"loss": 0.0077,
"num_tokens": 1227549.0,
"reward": 0.008362310007214546,
"reward_std": 0.023652182891964912,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.004609375260770321,
"rewards/format_reward_step": 0.0078125,
"rewards/stepwise_brier_reward": 0.005480488762259483,
"step": 4
},
{
"calib/answer_extract_rate": 0.03125,
"calib/avg_num_step_conf": 0.06640625,
"calib/ece": 0.6642857142857141,
"calib/final_conf_rate": 0.02734375,
"calib/format_rate": 0.015625,
"calib/frac_conf_gt_0.9": 0.14285714285714285,
"calib/gap": 0.050000000000000044,
"calib/mean_conf": 0.8071428571428572,
"calib/mu_c": 0.85,
"calib/mu_w": 0.7999999999999999,
"calib/nonempty_final_conf_rate": 0.02734375,
"calib/nonempty_reasoning_rate": 0.03125,
"calib/nonempty_step_conf_rate": 0.015625,
"calib/pce": 0.6642857142857141,
"calib/std_conf": 0.13477115902938006,
"calib/step_conf_rate": 0.015625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10546875,
"completions/max_length": 3037.0,
"completions/max_terminated_length": 3037.0,
"completions/mean_length": 680.9140625,
"completions/mean_terminated_length": 761.196533203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.005333333333333333,
"grad_norm": 0.22762255370616913,
"learning_rate": 1.0000000000000002e-06,
"loss": -0.0011,
"num_tokens": 1531591.0,
"reward": 0.01147377397865057,
"reward_std": 0.03245273604989052,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.008144531399011612,
"rewards/format_reward_step": 0.015625,
"rewards/stepwise_brier_reward": 0.0077310362830758095,
"step": 5
},
{
"calib/answer_extract_rate": 0.046875,
"calib/avg_num_step_conf": 0.09375,
"calib/ece": 0.6666666666666666,
"calib/final_conf_rate": 0.03515625,
"calib/format_rate": 0.0234375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.08750000000000002,
"calib/mean_conf": 0.7777777777777778,
"calib/mu_c": 0.7,
"calib/mu_w": 0.7875,
"calib/nonempty_final_conf_rate": 0.03515625,
"calib/nonempty_reasoning_rate": 0.05078125,
"calib/nonempty_step_conf_rate": 0.02734375,
"calib/pce": 0.6666666666666666,
"calib/std_conf": 0.10030816714037662,
"calib/step_conf_rate": 0.02734375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08984375,
"completions/max_length": 3011.0,
"completions/max_terminated_length": 3011.0,
"completions/mean_length": 616.640625,
"completions/mean_terminated_length": 677.5107421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0064,
"grad_norm": 0.23622936010360718,
"learning_rate": 1.25e-06,
"loss": 0.0143,
"num_tokens": 1818443.0,
"reward": 0.019612066447734833,
"reward_std": 0.04767308384180069,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.012529296800494194,
"rewards/format_reward_step": 0.0234375,
"rewards/stepwise_brier_reward": 0.012764675542712212,
"step": 6
},
{
"calib/answer_extract_rate": 0.0234375,
"calib/avg_num_step_conf": 0.06640625,
"calib/ece": 0.53,
"calib/final_conf_rate": 0.01953125,
"calib/format_rate": 0.015625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.033333333333333326,
"calib/mean_conf": 0.5700000000000001,
"calib/mu_c": 0.55,
"calib/mu_w": 0.5833333333333334,
"calib/nonempty_final_conf_rate": 0.01953125,
"calib/nonempty_reasoning_rate": 0.0234375,
"calib/nonempty_step_conf_rate": 0.01953125,
"calib/pce": 0.3500000000000001,
"calib/std_conf": 0.24000000000000002,
"calib/step_conf_rate": 0.01953125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15234375,
"completions/max_length": 3033.0,
"completions/max_terminated_length": 3033.0,
"completions/mean_length": 641.45703125,
"completions/mean_terminated_length": 756.741943359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.007466666666666667,
"grad_norm": 0.2200292944908142,
"learning_rate": 1.5e-06,
"loss": 0.0016,
"num_tokens": 2113120.0,
"reward": 0.02059093490242958,
"reward_std": 0.05823996290564537,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.009023437276482582,
"rewards/format_reward_step": 0.015625,
"rewards/stepwise_brier_reward": 0.011191869154572487,
"step": 7
},
{
"calib/answer_extract_rate": 0.0234375,
"calib/avg_num_step_conf": 0.046875,
"calib/ece": 0.15000000000000002,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.01171875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.85,
"calib/mu_c": 0.85,
"calib/mu_w": NaN,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.02734375,
"calib/nonempty_step_conf_rate": 0.015625,
"calib/pce": 0.0,
"calib/std_conf": 0.04082482904638629,
"calib/step_conf_rate": 0.015625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 3039.0,
"completions/max_terminated_length": 3039.0,
"completions/mean_length": 683.51171875,
"completions/mean_terminated_length": 754.2198486328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.008533333333333334,
"grad_norm": 0.3492337167263031,
"learning_rate": 1.75e-06,
"loss": 0.0032,
"num_tokens": 2417651.0,
"reward": 0.021268287673592567,
"reward_std": 0.06015579774975777,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.011435546912252903,
"rewards/format_reward_step": 0.01171875,
"rewards/stepwise_brier_reward": 0.0106395548209548,
"step": 8
},
{
"calib/answer_extract_rate": 0.0390625,
"calib/avg_num_step_conf": 0.046875,
"calib/ece": 0.3357142857142857,
"calib/final_conf_rate": 0.02734375,
"calib/format_rate": 0.01171875,
"calib/frac_conf_gt_0.9": 0.14285714285714285,
"calib/gap": 0.033333333333333326,
"calib/mean_conf": 0.7642857142857142,
"calib/mu_c": 0.7833333333333333,
"calib/mu_w": 0.75,
"calib/nonempty_final_conf_rate": 0.02734375,
"calib/nonempty_reasoning_rate": 0.0390625,
"calib/nonempty_step_conf_rate": 0.015625,
"calib/pce": 0.3357142857142857,
"calib/std_conf": 0.12737538928662148,
"calib/step_conf_rate": 0.015625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 2833.0,
"completions/max_terminated_length": 2833.0,
"completions/mean_length": 603.69921875,
"completions/mean_terminated_length": 677.8377075195312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0096,
"grad_norm": 0.32757800817489624,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0073,
"num_tokens": 2702774.0,
"reward": 0.022961322218179703,
"reward_std": 0.06494442373514175,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.009033203125,
"rewards/format_reward_step": 0.01171875,
"rewards/stepwise_brier_reward": 0.006591379642486572,
"step": 9
},
{
"calib/answer_extract_rate": 0.02734375,
"calib/avg_num_step_conf": 0.0703125,
"calib/ece": 0.5214285714285714,
"calib/final_conf_rate": 0.02734375,
"calib/format_rate": 0.01953125,
"calib/frac_conf_gt_0.9": 0.2857142857142857,
"calib/gap": -0.009999999999999898,
"calib/mean_conf": 0.807142857142857,
"calib/mu_c": 0.8,
"calib/mu_w": 0.8099999999999999,
"calib/nonempty_final_conf_rate": 0.02734375,
"calib/nonempty_reasoning_rate": 0.03125,
"calib/nonempty_step_conf_rate": 0.0234375,
"calib/pce": 0.5214285714285714,
"calib/std_conf": 0.11473127431577862,
"calib/step_conf_rate": 0.0234375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 3056.0,
"completions/max_terminated_length": 3056.0,
"completions/mean_length": 672.26171875,
"completions/mean_terminated_length": 729.2330322265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.010666666666666666,
"grad_norm": 0.25926557183265686,
"learning_rate": 2.25e-06,
"loss": -0.0039,
"num_tokens": 3004713.0,
"reward": 0.018368151038885117,
"reward_std": 0.05195297300815582,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.009648437611758709,
"rewards/format_reward_step": 0.01953125,
"rewards/stepwise_brier_reward": 0.015113226138055325,
"step": 10
},
{
"calib/answer_extract_rate": 0.0546875,
"calib/avg_num_step_conf": 0.23046875,
"calib/ece": 0.16153846153846155,
"calib/final_conf_rate": 0.05078125,
"calib/format_rate": 0.04296875,
"calib/frac_conf_gt_0.9": 0.07692307692307693,
"calib/gap": 0.014999999999999902,
"calib/mean_conf": 0.7615384615384614,
"calib/mu_c": 0.7649999999999999,
"calib/mu_w": 0.75,
"calib/nonempty_final_conf_rate": 0.05078125,
"calib/nonempty_reasoning_rate": 0.0625,
"calib/nonempty_step_conf_rate": 0.05078125,
"calib/pce": 0.07692307692307696,
"calib/std_conf": 0.12883180172649406,
"calib/step_conf_rate": 0.05078125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 2961.0,
"completions/max_terminated_length": 2961.0,
"completions/mean_length": 706.15625,
"completions/mean_terminated_length": 792.877197265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.011733333333333333,
"grad_norm": 0.32639095187187195,
"learning_rate": 2.5e-06,
"loss": 0.0305,
"num_tokens": 3313009.0,
"reward": 0.07463431358337402,
"reward_std": 0.18631576001644135,
"rewards/accuracy_reward_step": 0.04296875,
"rewards/final_brier_reward_step": 0.03682617098093033,
"rewards/format_reward_step": 0.04296875,
"rewards/stepwise_brier_reward": 0.03582242131233215,
"step": 11
},
{
"calib/answer_extract_rate": 0.09765625,
"calib/avg_num_step_conf": 0.37890625,
"calib/ece": 0.2871428571428571,
"calib/final_conf_rate": 0.08203125,
"calib/format_rate": 0.06640625,
"calib/frac_conf_gt_0.9": 0.09523809523809523,
"calib/gap": 0.03277777777777757,
"calib/mean_conf": 0.7442857142857142,
"calib/mu_c": 0.7583333333333332,
"calib/mu_w": 0.7255555555555556,
"calib/nonempty_final_conf_rate": 0.08203125,
"calib/nonempty_reasoning_rate": 0.10546875,
"calib/nonempty_step_conf_rate": 0.08984375,
"calib/pce": 0.22999999999999998,
"calib/std_conf": 0.13692442922914774,
"calib/step_conf_rate": 0.08984375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2966.0,
"completions/max_terminated_length": 2966.0,
"completions/mean_length": 563.20703125,
"completions/mean_terminated_length": 600.7542114257812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0128,
"grad_norm": 0.4486089050769806,
"learning_rate": 2.7500000000000004e-06,
"loss": -0.0015,
"num_tokens": 3584406.0,
"reward": 0.1078411191701889,
"reward_std": 0.16879743337631226,
"rewards/accuracy_reward_step": 0.0625,
"rewards/final_brier_reward_step": 0.04987538978457451,
"rewards/format_reward_step": 0.06640625,
"rewards/stepwise_brier_reward": 0.05505118519067764,
"step": 12
},
{
"calib/answer_extract_rate": 0.09765625,
"calib/avg_num_step_conf": 0.7265625,
"calib/ece": 0.6033333333333333,
"calib/final_conf_rate": 0.09375,
"calib/format_rate": 0.05859375,
"calib/frac_conf_gt_0.9": 0.125,
"calib/gap": 0.06600000000000006,
"calib/mean_conf": 0.7699999999999999,
"calib/mu_c": 0.825,
"calib/mu_w": 0.7589999999999999,
"calib/nonempty_final_conf_rate": 0.09375,
"calib/nonempty_reasoning_rate": 0.140625,
"calib/nonempty_step_conf_rate": 0.1171875,
"calib/pce": 0.6033333333333333,
"calib/std_conf": 0.16830032679706836,
"calib/step_conf_rate": 0.1171875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 3063.0,
"completions/max_terminated_length": 3063.0,
"completions/mean_length": 650.65625,
"completions/mean_terminated_length": 699.8656005859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.013866666666666666,
"grad_norm": 0.27529487013816833,
"learning_rate": 3e-06,
"loss": 0.0526,
"num_tokens": 3878606.0,
"reward": 0.05349308252334595,
"reward_std": 0.12776944041252136,
"rewards/accuracy_reward_step": 0.0234375,
"rewards/final_brier_reward_step": 0.029758203774690628,
"rewards/format_reward_step": 0.05859375,
"rewards/stepwise_brier_reward": 0.03726842254400253,
"step": 13
},
{
"calib/answer_extract_rate": 0.25,
"calib/avg_num_step_conf": 1.13671875,
"calib/ece": 0.4633333333333334,
"calib/final_conf_rate": 0.19921875,
"calib/format_rate": 0.1484375,
"calib/frac_conf_gt_0.9": 0.058823529411764705,
"calib/gap": 0.002000000000000224,
"calib/mean_conf": 0.733921568627451,
"calib/mu_c": 0.7353333333333334,
"calib/mu_w": 0.7333333333333332,
"calib/nonempty_final_conf_rate": 0.19921875,
"calib/nonempty_reasoning_rate": 0.29296875,
"calib/nonempty_step_conf_rate": 0.25390625,
"calib/pce": 0.4515686274509805,
"calib/std_conf": 0.14506358891504084,
"calib/step_conf_rate": 0.25390625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2898.0,
"completions/max_terminated_length": 2898.0,
"completions/mean_length": 566.8515625,
"completions/mean_terminated_length": 594.7294921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.014933333333333333,
"grad_norm": 0.6952277421951294,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.0509,
"num_tokens": 4152160.0,
"reward": 0.16866666078567505,
"reward_std": 0.2768186032772064,
"rewards/accuracy_reward_step": 0.08203125,
"rewards/final_brier_reward_step": 0.0842718705534935,
"rewards/format_reward_step": 0.1484375,
"rewards/stepwise_brier_reward": 0.11862284690141678,
"step": 14
},
{
"calib/answer_extract_rate": 0.41015625,
"calib/avg_num_step_conf": 1.859375,
"calib/ece": 0.41095744680851054,
"calib/final_conf_rate": 0.3671875,
"calib/format_rate": 0.3046875,
"calib/frac_conf_gt_0.9": 0.11702127659574468,
"calib/gap": 0.019162227602905424,
"calib/mean_conf": 0.7588297872340425,
"calib/mu_c": 0.7708571428571429,
"calib/mu_w": 0.7516949152542375,
"calib/nonempty_final_conf_rate": 0.3671875,
"calib/nonempty_reasoning_rate": 0.48046875,
"calib/nonempty_step_conf_rate": 0.41796875,
"calib/pce": 0.39872340425531905,
"calib/std_conf": 0.14718533968810774,
"calib/step_conf_rate": 0.41796875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2689.0,
"completions/max_terminated_length": 2689.0,
"completions/mean_length": 483.1328125,
"completions/mean_terminated_length": 490.8016052246094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.016,
"grad_norm": 0.8166272044181824,
"learning_rate": 3.5e-06,
"loss": 0.0548,
"num_tokens": 4406762.0,
"reward": 0.3417610228061676,
"reward_std": 0.5316358208656311,
"rewards/accuracy_reward_step": 0.16015625,
"rewards/final_brier_reward_step": 0.1889699250459671,
"rewards/format_reward_step": 0.3046875,
"rewards/stepwise_brier_reward": 0.22660425305366516,
"step": 15
},
{
"calib/answer_extract_rate": 0.640625,
"calib/avg_num_step_conf": 3.7421875,
"calib/ece": 0.4356962025316456,
"calib/final_conf_rate": 0.6171875,
"calib/format_rate": 0.515625,
"calib/frac_conf_gt_0.9": 0.06962025316455696,
"calib/gap": 0.040874455732946235,
"calib/mean_conf": 0.7439240506329114,
"calib/mu_c": 0.7713461538461538,
"calib/mu_w": 0.7304716981132076,
"calib/nonempty_final_conf_rate": 0.6171875,
"calib/nonempty_reasoning_rate": 0.765625,
"calib/nonempty_step_conf_rate": 0.703125,
"calib/pce": 0.425253164556962,
"calib/std_conf": 0.15553434006853403,
"calib/step_conf_rate": 0.703125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2831.0,
"completions/max_terminated_length": 2831.0,
"completions/mean_length": 475.28125,
"completions/mean_terminated_length": 479.02362060546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.017066666666666667,
"grad_norm": 0.7212015986442566,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0315,
"num_tokens": 4660322.0,
"reward": 0.5336143374443054,
"reward_std": 0.5757545828819275,
"rewards/accuracy_reward_step": 0.2265625,
"rewards/final_brier_reward_step": 0.31148359179496765,
"rewards/format_reward_step": 0.515625,
"rewards/stepwise_brier_reward": 0.39899012446403503,
"step": 16
},
{
"calib/answer_extract_rate": 0.87109375,
"calib/avg_num_step_conf": 4.921875,
"calib/ece": 0.3467136150234741,
"calib/final_conf_rate": 0.83203125,
"calib/format_rate": 0.79296875,
"calib/frac_conf_gt_0.9": 0.07511737089201878,
"calib/gap": 0.039069767441860526,
"calib/mean_conf": 0.7396713615023475,
"calib/mu_c": 0.7633333333333332,
"calib/mu_w": 0.7242635658914727,
"calib/nonempty_final_conf_rate": 0.83203125,
"calib/nonempty_reasoning_rate": 0.921875,
"calib/nonempty_step_conf_rate": 0.89453125,
"calib/pce": 0.3460093896713614,
"calib/std_conf": 0.15670427570716403,
"calib/step_conf_rate": 0.89453125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1576.0,
"completions/max_terminated_length": 1576.0,
"completions/mean_length": 385.89453125,
"completions/mean_terminated_length": 387.4078674316406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.018133333333333335,
"grad_norm": 1.0878597497940063,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0649,
"num_tokens": 4885679.0,
"reward": 0.825513482093811,
"reward_std": 0.5877258777618408,
"rewards/accuracy_reward_step": 0.34375,
"rewards/final_brier_reward_step": 0.5036300420761108,
"rewards/format_reward_step": 0.79296875,
"rewards/stepwise_brier_reward": 0.6026062965393066,
"step": 17
},
{
"calib/answer_extract_rate": 0.8828125,
"calib/avg_num_step_conf": 4.65234375,
"calib/ece": 0.3434090909090908,
"calib/final_conf_rate": 0.859375,
"calib/format_rate": 0.80859375,
"calib/frac_conf_gt_0.9": 0.03636363636363636,
"calib/gap": 0.035937499999999956,
"calib/mean_conf": 0.7615909090909091,
"calib/mu_c": 0.7825,
"calib/mu_w": 0.7465625,
"calib/nonempty_final_conf_rate": 0.859375,
"calib/nonempty_reasoning_rate": 0.95703125,
"calib/nonempty_step_conf_rate": 0.90625,
"calib/pce": 0.3434090909090908,
"calib/std_conf": 0.10826615314748822,
"calib/step_conf_rate": 0.90625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2957.0,
"completions/max_terminated_length": 2957.0,
"completions/mean_length": 393.1171875,
"completions/mean_terminated_length": 393.1171875,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.0192,
"grad_norm": 0.529790461063385,
"learning_rate": 4.25e-06,
"loss": 0.0308,
"num_tokens": 5120077.0,
"reward": 0.8530287742614746,
"reward_std": 0.5156837701797485,
"rewards/accuracy_reward_step": 0.36328125,
"rewards/final_brier_reward_step": 0.5240890979766846,
"rewards/format_reward_step": 0.80859375,
"rewards/stepwise_brier_reward": 0.5873744487762451,
"step": 18
},
{
"calib/answer_extract_rate": 0.9453125,
"calib/avg_num_step_conf": 4.90234375,
"calib/ece": 0.32182203389830516,
"calib/final_conf_rate": 0.921875,
"calib/format_rate": 0.859375,
"calib/frac_conf_gt_0.9": 0.038135593220338986,
"calib/gap": 0.03419046206333909,
"calib/mean_conf": 0.725635593220339,
"calib/mu_c": 0.7457731958762887,
"calib/mu_w": 0.7115827338129496,
"calib/nonempty_final_conf_rate": 0.921875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.3182203389830509,
"calib/std_conf": 0.13758209661650694,
"calib/step_conf_rate": 0.9765625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 889.0,
"completions/max_terminated_length": 889.0,
"completions/mean_length": 299.90625,
"completions/mean_terminated_length": 302.2677001953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.020266666666666665,
"grad_norm": 0.6062198877334595,
"learning_rate": 4.5e-06,
"loss": -0.0175,
"num_tokens": 5324653.0,
"reward": 0.93487948179245,
"reward_std": 0.5158452987670898,
"rewards/accuracy_reward_step": 0.39453125,
"rewards/final_brier_reward_step": 0.5712933540344238,
"rewards/format_reward_step": 0.859375,
"rewards/stepwise_brier_reward": 0.6750562787055969,
"step": 19
},
{
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 5.3046875,
"calib/ece": 0.22338645418326694,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.02390438247011952,
"calib/gap": 0.01116274634456449,
"calib/mean_conf": 0.7054581673306772,
"calib/mu_c": 0.7112396694214876,
"calib/mu_w": 0.7000769230769232,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.22338645418326694,
"calib/std_conf": 0.1201198324442809,
"calib/step_conf_rate": 0.9765625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1251.0,
"completions/max_terminated_length": 1251.0,
"completions/mean_length": 308.921875,
"completions/mean_terminated_length": 310.13336181640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 110.0,
"epoch": 0.021333333333333333,
"grad_norm": 0.4974098205566406,
"learning_rate": 4.75e-06,
"loss": -0.011,
"num_tokens": 5531649.0,
"reward": 1.0871165990829468,
"reward_std": 0.47273990511894226,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.6622281074523926,
"rewards/format_reward_step": 0.953125,
"rewards/stepwise_brier_reward": 0.7521353363990784,
"step": 20
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 5.59765625,
"calib/ece": 0.20835294117647063,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.00784313725490196,
"calib/gap": 0.0202500000000001,
"calib/mean_conf": 0.6761960784313724,
"calib/mu_c": 0.6869166666666667,
"calib/mu_w": 0.6666666666666666,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2069803921568628,
"calib/std_conf": 0.11498695912552442,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 673.0,
"completions/max_terminated_length": 673.0,
"completions/mean_length": 296.0234375,
"completions/mean_terminated_length": 298.3543395996094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 61.0,
"epoch": 0.0224,
"grad_norm": 0.591964840888977,
"learning_rate": 5e-06,
"loss": 0.032,
"num_tokens": 5733431.0,
"reward": 1.120538592338562,
"reward_std": 0.46937960386276245,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.7012964487075806,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.8076863288879395,
"step": 21
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 5.44140625,
"calib/ece": 0.184404761904762,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.049787878787878714,
"calib/mean_conf": 0.657420634920635,
"calib/mu_c": 0.6835,
"calib/mu_w": 0.6337121212121213,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.1828174603174604,
"calib/std_conf": 0.1132987239294482,
"calib/step_conf_rate": 0.98046875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 922.0,
"completions/max_terminated_length": 922.0,
"completions/mean_length": 289.27734375,
"completions/mean_terminated_length": 291.55511474609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 52.0,
"epoch": 0.023466666666666667,
"grad_norm": 1.960092544555664,
"learning_rate": 4.9722222222222224e-06,
"loss": -0.025,
"num_tokens": 5932342.0,
"reward": 1.1121745109558105,
"reward_std": 0.41868239641189575,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.7098742723464966,
"rewards/format_reward_step": 0.96875,
"rewards/stepwise_brier_reward": 0.7664496898651123,
"step": 22
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 5.30859375,
"calib/ece": 0.17490196078431378,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.016686335403726882,
"calib/mean_conf": 0.6258823529411763,
"calib/mu_c": 0.6350434782608697,
"calib/mu_w": 0.6183571428571428,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.17490196078431378,
"calib/std_conf": 0.0993108897645984,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 826.0,
"completions/max_terminated_length": 826.0,
"completions/mean_length": 297.53515625,
"completions/mean_terminated_length": 299.8779602050781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 104.0,
"epoch": 0.024533333333333334,
"grad_norm": 0.648651659488678,
"learning_rate": 4.944444444444445e-06,
"loss": 0.0017,
"num_tokens": 6135487.0,
"reward": 1.103767991065979,
"reward_std": 0.4504122734069824,
"rewards/accuracy_reward_step": 0.44921875,
"rewards/final_brier_reward_step": 0.7110418081283569,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.8008010387420654,
"step": 23
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 5.87890625,
"calib/ece": 0.2616470588235294,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.0043316624895572975,
"calib/mean_conf": 0.5706666666666667,
"calib/mu_c": 0.5735714285714286,
"calib/mu_w": 0.5692397660818713,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.25145098039215685,
"calib/std_conf": 0.10516815045098513,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2029.0,
"completions/max_terminated_length": 2029.0,
"completions/mean_length": 316.91796875,
"completions/mean_terminated_length": 318.1607971191406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.0256,
"grad_norm": 1.1349138021469116,
"learning_rate": 4.9166666666666665e-06,
"loss": 0.0303,
"num_tokens": 6344170.0,
"reward": 0.9788568615913391,
"reward_std": 0.4376360774040222,
"rewards/accuracy_reward_step": 0.328125,
"rewards/final_brier_reward_step": 0.7089694738388062,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.7865509986877441,
"step": 24
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 6.01953125,
"calib/ece": 0.03484374999999991,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.042696697796496375,
"calib/mean_conf": 0.502421875,
"calib/mu_c": 0.5239370078740158,
"calib/mu_w": 0.4812403100775194,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.020585937499999887,
"calib/std_conf": 0.09926484534559239,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 692.0,
"completions/max_terminated_length": 692.0,
"completions/mean_length": 320.06640625,
"completions/mean_terminated_length": 322.58660888671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 123.0,
"epoch": 0.02666666666666667,
"grad_norm": 0.5203309059143066,
"learning_rate": 4.888888888888889e-06,
"loss": 0.0108,
"num_tokens": 6552371.0,
"reward": 1.1763631105422974,
"reward_std": 0.37089306116104126,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.7542519569396973,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.815698504447937,
"step": 25
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 6.01953125,
"calib/ece": 0.04555118110236215,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.017968206656731267,
"calib/mean_conf": 0.45484251968503936,
"calib/mu_c": 0.4641803278688525,
"calib/mu_w": 0.4462121212121212,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.010039370078740138,
"calib/std_conf": 0.10860141131004274,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 686.0,
"completions/max_terminated_length": 686.0,
"completions/mean_length": 323.80078125,
"completions/mean_terminated_length": 326.35040283203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.027733333333333332,
"grad_norm": 0.5072885155677795,
"learning_rate": 4.861111111111111e-06,
"loss": 0.001,
"num_tokens": 6763544.0,
"reward": 1.1537525653839111,
"reward_std": 0.31776195764541626,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.7410793304443359,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.829727053642273,
"step": 26
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 5.90234375,
"calib/ece": 0.07996062992125977,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.02364525407478424,
"calib/mean_conf": 0.4087007874015748,
"calib/mu_c": 0.42257142857142854,
"calib/mu_w": 0.3989261744966443,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.037637795275590524,
"calib/std_conf": 0.11348577315259133,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 859.0,
"completions/max_terminated_length": 859.0,
"completions/mean_length": 327.40234375,
"completions/mean_terminated_length": 329.9803161621094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.0288,
"grad_norm": 0.7500645518302917,
"learning_rate": 4.833333333333333e-06,
"loss": -0.0049,
"num_tokens": 6975615.0,
"reward": 1.0971195697784424,
"reward_std": 0.34505730867385864,
"rewards/accuracy_reward_step": 0.41015625,
"rewards/final_brier_reward_step": 0.7501621246337891,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.8506540656089783,
"step": 27
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 6.1015625,
"calib/ece": 0.2410196078431372,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.032722222222222264,
"calib/mean_conf": 0.2934901960784314,
"calib/mu_c": 0.30888888888888894,
"calib/mu_w": 0.27616666666666667,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.002549019607843136,
"calib/std_conf": 0.13479386914390834,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1312.0,
"completions/max_terminated_length": 1312.0,
"completions/mean_length": 338.34765625,
"completions/mean_terminated_length": 341.0118103027344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 96.0,
"epoch": 0.029866666666666666,
"grad_norm": 0.5649840235710144,
"learning_rate": 4.805555555555556e-06,
"loss": 0.0344,
"num_tokens": 7192216.0,
"reward": 1.1840614080429077,
"reward_std": 0.25355055928230286,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.6906328201293945,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.8471675515174866,
"step": 28
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 6.66796875,
"calib/ece": 0.27051181102362204,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.014085941381023315,
"calib/mean_conf": 0.22161417322834648,
"calib/mu_c": 0.2289344262295082,
"calib/mu_w": 0.21484848484848487,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.005905511811023622,
"calib/std_conf": 0.12363328726673672,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1565.0,
"completions/max_terminated_length": 1565.0,
"completions/mean_length": 395.6953125,
"completions/mean_terminated_length": 398.81103515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 96.0,
"epoch": 0.030933333333333334,
"grad_norm": 0.8290421962738037,
"learning_rate": 4.777777777777778e-06,
"loss": -0.0297,
"num_tokens": 7423682.0,
"reward": 1.122117042541504,
"reward_std": 0.25590378046035767,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.6661832332611084,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.854539692401886,
"step": 29
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 5.703125,
"calib/ece": 0.22226562499999997,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.04535552942632587,
"calib/mean_conf": 0.230859375,
"calib/mu_c": 0.25619469026548675,
"calib/mu_w": 0.21083916083916088,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.005859374999999998,
"calib/std_conf": 0.12486259537831726,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1061.0,
"completions/max_terminated_length": 1061.0,
"completions/mean_length": 351.91015625,
"completions/mean_terminated_length": 354.68109130859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 107.0,
"epoch": 0.032,
"grad_norm": 0.6820420622825623,
"learning_rate": 4.75e-06,
"loss": -0.0371,
"num_tokens": 7643795.0,
"reward": 1.1040761470794678,
"reward_std": 0.2836735248565674,
"rewards/accuracy_reward_step": 0.44140625,
"rewards/final_brier_reward_step": 0.7090429663658142,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.8357192277908325,
"step": 30
},
{
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 6.03125,
"calib/ece": 0.16719921875000002,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.07440250015863953,
"calib/mean_conf": 0.23514453125,
"calib/mu_c": 0.2796116504854369,
"calib/mu_w": 0.20520915032679737,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0,
"calib/std_conf": 0.13737968674250123,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1105.0,
"completions/max_terminated_length": 1105.0,
"completions/mean_length": 369.02734375,
"completions/mean_terminated_length": 371.9330749511719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.03306666666666667,
"grad_norm": 0.865591287612915,
"learning_rate": 4.722222222222222e-06,
"loss": -0.0197,
"num_tokens": 7867218.0,
"reward": 1.0817630290985107,
"reward_std": 0.2301708161830902,
"rewards/accuracy_reward_step": 0.40234375,
"rewards/final_brier_reward_step": 0.7484901547431946,
"rewards/format_reward_step": 1.0,
"rewards/stepwise_brier_reward": 0.8206971883773804,
"step": 31
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 5.3046875,
"calib/ece": 0.2751764705882353,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.024264597191426485,
"calib/mean_conf": 0.22443137254901963,
"calib/mu_c": 0.2369918699186992,
"calib/mu_w": 0.2127272727272727,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.008627450980392156,
"calib/std_conf": 0.11848210833773733,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 841.0,
"completions/max_terminated_length": 841.0,
"completions/mean_length": 348.44921875,
"completions/mean_terminated_length": 351.1929016113281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 123.0,
"epoch": 0.034133333333333335,
"grad_norm": 1.52852463722229,
"learning_rate": 4.694444444444445e-06,
"loss": 0.0016,
"num_tokens": 8086165.0,
"reward": 1.1227799654006958,
"reward_std": 0.20674574375152588,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.6792035102844238,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.8124001026153564,
"step": 32
},
{
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 5.4453125,
"calib/ece": 0.307578125,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.009687500000000016,
"calib/mean_conf": 0.192421875,
"calib/mu_c": 0.197265625,
"calib/mu_w": 0.18757812499999998,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0,
"calib/std_conf": 0.10805269326344613,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1120.0,
"completions/max_terminated_length": 1120.0,
"completions/mean_length": 367.27734375,
"completions/mean_terminated_length": 370.1692810058594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 113.0,
"epoch": 0.0352,
"grad_norm": 1.088653326034546,
"learning_rate": 4.666666666666667e-06,
"loss": 0.01,
"num_tokens": 8310100.0,
"reward": 1.1242845058441162,
"reward_std": 0.20601904392242432,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.6485640406608582,
"rewards/format_reward_step": 1.0,
"rewards/stepwise_brier_reward": 0.8000099658966064,
"step": 33
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 5.375,
"calib/ece": 0.35748031496062993,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.01682452642073773,
"calib/mean_conf": 0.18188976377952756,
"calib/mu_c": 0.18970588235294117,
"calib/mu_w": 0.17288135593220344,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0019685039370078744,
"calib/std_conf": 0.10140145740822037,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2223.0,
"completions/max_terminated_length": 2223.0,
"completions/mean_length": 362.05859375,
"completions/mean_terminated_length": 363.47845458984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 122.0,
"epoch": 0.03626666666666667,
"grad_norm": 1.0133696794509888,
"learning_rate": 4.638888888888889e-06,
"loss": 0.056,
"num_tokens": 8530939.0,
"reward": 1.1375608444213867,
"reward_std": 0.2840504050254822,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.6156054735183716,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.7987202405929565,
"step": 34
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 4.53125,
"calib/ece": 0.3446484374999999,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.04685051958433248,
"calib/mean_conf": 0.19988281250000003,
"calib/mu_c": 0.22129496402877696,
"calib/mu_w": 0.17444444444444449,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.0007812499999999998,
"calib/std_conf": 0.11171770514153001,
"calib/step_conf_rate": 0.96875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1132.0,
"completions/max_terminated_length": 1132.0,
"completions/mean_length": 383.30078125,
"completions/mean_terminated_length": 386.31890869140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.037333333333333336,
"grad_norm": 1.2451777458190918,
"learning_rate": 4.611111111111112e-06,
"loss": 0.0221,
"num_tokens": 8761360.0,
"reward": 1.143646478652954,
"reward_std": 0.25098735094070435,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.6174097657203674,
"rewards/format_reward_step": 0.96484375,
"rewards/stepwise_brier_reward": 0.7819544076919556,
"step": 35
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 3.7578125,
"calib/ece": 0.459764705882353,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.06412857142857142,
"calib/mean_conf": 0.22650980392156858,
"calib/mu_c": 0.24662857142857142,
"calib/mu_w": 0.1825,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.0,
"calib/std_conf": 0.09234618850463783,
"calib/step_conf_rate": 0.98828125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 851.0,
"completions/max_terminated_length": 851.0,
"completions/mean_length": 308.08984375,
"completions/mean_terminated_length": 310.5157470703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 128.0,
"epoch": 0.0384,
"grad_norm": 1.2867798805236816,
"learning_rate": 4.583333333333333e-06,
"loss": 0.0763,
"num_tokens": 8965983.0,
"reward": 1.2754257917404175,
"reward_std": 0.25121673941612244,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.5783679485321045,
"rewards/format_reward_step": 0.98046875,
"rewards/stepwise_brier_reward": 0.8184047937393188,
"step": 36
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 3.9296875,
"calib/ece": 0.23402390438247014,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.0461296083684144,
"calib/mean_conf": 0.29984063745019923,
"calib/mu_c": 0.3213432835820896,
"calib/mu_w": 0.2752136752136752,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.0,
"calib/std_conf": 0.11390905762211623,
"calib/step_conf_rate": 0.98046875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1939.0,
"completions/max_terminated_length": 1939.0,
"completions/mean_length": 381.1484375,
"completions/mean_terminated_length": 384.14959716796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.039466666666666664,
"grad_norm": 0.9557704329490662,
"learning_rate": 4.555555555555556e-06,
"loss": 0.0051,
"num_tokens": 9193693.0,
"reward": 1.1587154865264893,
"reward_std": 0.2776384949684143,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.6743347644805908,
"rewards/format_reward_step": 0.9609375,
"rewards/stepwise_brier_reward": 0.7924422025680542,
"step": 37
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 3.91796875,
"calib/ece": 0.16411764705882348,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.07123686186186184,
"calib/mean_conf": 0.40058823529411763,
"calib/mu_c": 0.4315972222222222,
"calib/mu_w": 0.36036036036036034,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.0,
"calib/std_conf": 0.1357248046846623,
"calib/step_conf_rate": 0.98828125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2757.0,
"completions/max_terminated_length": 2757.0,
"completions/mean_length": 414.10546875,
"completions/mean_terminated_length": 415.72943115234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 136.0,
"epoch": 0.04053333333333333,
"grad_norm": 1.3092141151428223,
"learning_rate": 4.527777777777778e-06,
"loss": 0.0077,
"num_tokens": 9429632.0,
"reward": 1.2299902439117432,
"reward_std": 0.3092259168624878,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.7314550876617432,
"rewards/format_reward_step": 0.984375,
"rewards/stepwise_brier_reward": 0.8133012056350708,
"step": 38
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 3.94921875,
"calib/ece": 0.05313725490196089,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.04198345541803494,
"calib/mean_conf": 0.5374509803921569,
"calib/mu_c": 0.5550675675675676,
"calib/mu_w": 0.5130841121495326,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.005098039215686271,
"calib/std_conf": 0.10403323634178029,
"calib/step_conf_rate": 0.98828125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2034.0,
"completions/max_terminated_length": 2034.0,
"completions/mean_length": 385.234375,
"completions/mean_terminated_length": 386.7451171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 139.0,
"epoch": 0.0416,
"grad_norm": 1.2867313623428345,
"learning_rate": 4.5e-06,
"loss": 0.0126,
"num_tokens": 9657380.0,
"reward": 1.2601454257965088,
"reward_std": 0.2809554934501648,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.7519237995147705,
"rewards/format_reward_step": 0.984375,
"rewards/stepwise_brier_reward": 0.8304837942123413,
"step": 39
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 3.66796875,
"calib/ece": 0.018823529411764572,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.015337787212787224,
"calib/mean_conf": 0.5733333333333333,
"calib/mu_c": 0.58006993006993,
"calib/mu_w": 0.5647321428571428,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.015686274509803786,
"calib/std_conf": 0.06769790643696237,
"calib/step_conf_rate": 0.98828125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1494.0,
"completions/max_terminated_length": 1494.0,
"completions/mean_length": 406.4296875,
"completions/mean_terminated_length": 408.0235595703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.042666666666666665,
"grad_norm": 0.8483911752700806,
"learning_rate": 4.472222222222223e-06,
"loss": 0.0088,
"num_tokens": 9891226.0,
"reward": 1.2255644798278809,
"reward_std": 0.35025548934936523,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7437109351158142,
"rewards/format_reward_step": 0.984375,
"rewards/stepwise_brier_reward": 0.786710798740387,
"step": 40
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 3.38671875,
"calib/ece": 0.17570281124498008,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.015512934879571727,
"calib/mean_conf": 0.5889558232931728,
"calib/mu_c": 0.5926315789473684,
"calib/mu_w": 0.5771186440677967,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.0008032128514056232,
"calib/std_conf": 0.048558684491893225,
"calib/step_conf_rate": 0.984375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 1105.0,
"completions/max_terminated_length": 1105.0,
"completions/mean_length": 349.5546875,
"completions/mean_terminated_length": 357.94403076171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 107.0,
"epoch": 0.04373333333333333,
"grad_norm": 2.9355227947235107,
"learning_rate": 4.444444444444444e-06,
"loss": -0.0397,
"num_tokens": 10111000.0,
"reward": 1.4171063899993896,
"reward_std": 0.3127620220184326,
"rewards/accuracy_reward_step": 0.7421875,
"rewards/final_brier_reward_step": 0.7581347227096558,
"rewards/format_reward_step": 0.95703125,
"rewards/stepwise_brier_reward": 0.8005934953689575,
"step": 41
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 3.39453125,
"calib/ece": 0.021599999999999748,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.005966930265995818,
"calib/mean_conf": 0.5935999999999999,
"calib/mu_c": 0.5961538461538461,
"calib/mu_w": 0.5901869158878503,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.021599999999999748,
"calib/std_conf": 0.03433715189120961,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 1590.0,
"completions/max_terminated_length": 1590.0,
"completions/mean_length": 330.984375,
"completions/mean_terminated_length": 338.9280090332031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 131.0,
"epoch": 0.0448,
"grad_norm": 2.04837965965271,
"learning_rate": 4.416666666666667e-06,
"loss": -0.0437,
"num_tokens": 10323140.0,
"reward": 1.219673752784729,
"reward_std": 0.2220257967710495,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7362304925918579,
"rewards/format_reward_step": 0.97265625,
"rewards/stepwise_brier_reward": 0.7827968597412109,
"step": 42
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 3.171875,
"calib/ece": 0.032480314960630106,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.008940748096656792,
"calib/mean_conf": 0.5982283464566929,
"calib/mu_c": 0.6015723270440252,
"calib/mu_w": 0.5926315789473684,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.002362204724409446,
"calib/std_conf": 0.03446456333122836,
"calib/step_conf_rate": 0.9765625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1652.0,
"completions/max_terminated_length": 1652.0,
"completions/mean_length": 411.0390625,
"completions/mean_terminated_length": 411.0390625,
"completions/min_length": 125.0,
"completions/min_terminated_length": 125.0,
"epoch": 0.04586666666666667,
"grad_norm": 0.8822596073150635,
"learning_rate": 4.388888888888889e-06,
"loss": 0.0455,
"num_tokens": 10556630.0,
"reward": 1.2898056507110596,
"reward_std": 0.2994333803653717,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.742919921875,
"rewards/format_reward_step": 0.96875,
"rewards/stepwise_brier_reward": 0.8015077114105225,
"step": 43
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 2.828125,
"calib/ece": 0.08119999999999976,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.006368772470467632,
"calib/mean_conf": 0.6075999999999999,
"calib/mu_c": 0.6106060606060606,
"calib/mu_w": 0.604237288135593,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.08039999999999976,
"calib/std_conf": 0.03636261816756323,
"calib/step_conf_rate": 0.98828125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 1850.0,
"completions/max_terminated_length": 1850.0,
"completions/mean_length": 421.0234375,
"completions/mean_terminated_length": 429.4103698730469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 125.0,
"epoch": 0.046933333333333334,
"grad_norm": 0.9918916821479797,
"learning_rate": 4.361111111111112e-06,
"loss": -0.0099,
"num_tokens": 10793772.0,
"reward": 1.165583610534668,
"reward_std": 0.25042974948883057,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.7213085889816284,
"rewards/format_reward_step": 0.96484375,
"rewards/stepwise_brier_reward": 0.7712799310684204,
"step": 44
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 3.01953125,
"calib/ece": 0.053149606299212726,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.011876693766937674,
"calib/mean_conf": 0.6137795275590551,
"calib/mu_c": 0.6179878048780487,
"calib/mu_w": 0.606111111111111,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.01062992125984252,
"calib/std_conf": 0.04099028373171769,
"calib/step_conf_rate": 0.98828125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2232.0,
"completions/max_terminated_length": 2232.0,
"completions/mean_length": 379.125,
"completions/mean_terminated_length": 382.1102294921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 84.0,
"epoch": 0.048,
"grad_norm": 1.1430001258850098,
"learning_rate": 4.333333333333334e-06,
"loss": -0.0041,
"num_tokens": 11018916.0,
"reward": 1.3244693279266357,
"reward_std": 0.2840193510055542,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.7614257335662842,
"rewards/format_reward_step": 0.984375,
"rewards/stepwise_brier_reward": 0.8187761306762695,
"step": 45
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 2.796875,
"calib/ece": 0.07094861660079053,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.009422492401215954,
"calib/mean_conf": 0.6266798418972331,
"calib/mu_c": 0.6308510638297873,
"calib/mu_w": 0.6214285714285713,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.07015810276679844,
"calib/std_conf": 0.054714894070779146,
"calib/step_conf_rate": 0.98046875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2988.0,
"completions/max_terminated_length": 2988.0,
"completions/mean_length": 455.6484375,
"completions/mean_terminated_length": 459.2362060546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 100.0,
"epoch": 0.04906666666666667,
"grad_norm": 0.6162858009338379,
"learning_rate": 4.305555555555556e-06,
"loss": 0.0113,
"num_tokens": 11263370.0,
"reward": 1.2110141515731812,
"reward_std": 0.29757267236709595,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7264159917831421,
"rewards/format_reward_step": 0.96875,
"rewards/stepwise_brier_reward": 0.8005996346473694,
"step": 46
},
{
"calib/answer_extract_rate": 0.953125,
"calib/avg_num_step_conf": 2.74609375,
"calib/ece": 0.040778688524590156,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.010471243601324898,
"calib/mean_conf": 0.6301229508196721,
"calib/mu_c": 0.633641975308642,
"calib/mu_w": 0.6231707317073171,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.003483606557377053,
"calib/std_conf": 0.05165682438987863,
"calib/step_conf_rate": 0.984375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2842.0,
"completions/max_terminated_length": 2842.0,
"completions/mean_length": 423.41796875,
"completions/mean_terminated_length": 435.3212585449219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.050133333333333335,
"grad_norm": 0.7916970252990723,
"learning_rate": 4.277777777777778e-06,
"loss": -0.0065,
"num_tokens": 11500781.0,
"reward": 1.2935917377471924,
"reward_std": 0.2790271043777466,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.7379980087280273,
"rewards/format_reward_step": 0.94921875,
"rewards/stepwise_brier_reward": 0.787433385848999,
"step": 47
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 2.3671875,
"calib/ece": 0.04126984126984135,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.018509803921568646,
"calib/mean_conf": 0.6365079365079366,
"calib/mu_c": 0.644,
"calib/mu_w": 0.6254901960784314,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.04126984126984135,
"calib/std_conf": 0.05248306545195376,
"calib/step_conf_rate": 0.98828125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2133.0,
"completions/max_terminated_length": 2133.0,
"completions/mean_length": 358.8046875,
"completions/mean_terminated_length": 365.95220947265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 94.0,
"epoch": 0.0512,
"grad_norm": 0.8702788352966309,
"learning_rate": 4.25e-06,
"loss": -0.0334,
"num_tokens": 11719363.0,
"reward": 1.254289150238037,
"reward_std": 0.3980778753757477,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.7466015815734863,
"rewards/format_reward_step": 0.9765625,
"rewards/stepwise_brier_reward": 0.7895784378051758,
"step": 48
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/avg_num_step_conf": 2.42578125,
"calib/ece": 0.07601626016260174,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.01412429378531077,
"calib/mean_conf": 0.6434959349593495,
"calib/mu_c": 0.6474576271186441,
"calib/mu_w": 0.6333333333333333,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.0,
"calib/std_conf": 0.0575443800429925,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2049.0,
"completions/max_terminated_length": 2049.0,
"completions/mean_length": 372.671875,
"completions/mean_terminated_length": 386.2510070800781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.05226666666666667,
"grad_norm": 0.6196070909500122,
"learning_rate": 4.222222222222223e-06,
"loss": -0.0149,
"num_tokens": 11942343.0,
"reward": 1.3646574020385742,
"reward_std": 0.31304365396499634,
"rewards/accuracy_reward_step": 0.69140625,
"rewards/final_brier_reward_step": 0.7571874856948853,
"rewards/format_reward_step": 0.953125,
"rewards/stepwise_brier_reward": 0.7973794937133789,
"step": 49
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 2.33203125,
"calib/ece": 0.06333333333333331,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.0007627504553734399,
"calib/mean_conf": 0.6543137254901961,
"calib/mu_c": 0.6540983606557377,
"calib/mu_w": 0.6548611111111111,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.0,
"calib/std_conf": 0.05531566816865047,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1421.0,
"completions/max_terminated_length": 1421.0,
"completions/mean_length": 369.453125,
"completions/mean_terminated_length": 372.3622131347656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.05333333333333334,
"grad_norm": 1.208615779876709,
"learning_rate": 4.194444444444445e-06,
"loss": 0.0162,
"num_tokens": 12165323.0,
"reward": 1.405625343322754,
"reward_std": 0.3000626266002655,
"rewards/accuracy_reward_step": 0.71484375,
"rewards/final_brier_reward_step": 0.7803418040275574,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.8071305751800537,
"step": 50
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 2.48828125,
"calib/ece": 0.1295180722891566,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.052451690821256225,
"calib/mean_conf": 0.6524096385542169,
"calib/mu_c": 0.6669444444444446,
"calib/mu_w": 0.6144927536231883,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.02951807228915665,
"calib/std_conf": 0.06596683531073422,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2124.0,
"completions/max_terminated_length": 2124.0,
"completions/mean_length": 397.078125,
"completions/mean_terminated_length": 408.2409362792969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 70.0,
"epoch": 0.0544,
"grad_norm": 0.672381579875946,
"learning_rate": 4.166666666666667e-06,
"loss": -0.0342,
"num_tokens": 12399311.0,
"reward": 1.3945393562316895,
"reward_std": 0.3444993793964386,
"rewards/accuracy_reward_step": 0.703125,
"rewards/final_brier_reward_step": 0.7891894578933716,
"rewards/format_reward_step": 0.97265625,
"rewards/stepwise_brier_reward": 0.7982159852981567,
"step": 51
},
{
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 2.3984375,
"calib/ece": 0.14576612903225805,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.038650217706821643,
"calib/mean_conf": 0.6502016129032258,
"calib/mu_c": 0.6584615384615385,
"calib/mu_w": 0.6198113207547169,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.004838709677419377,
"calib/std_conf": 0.06342031828674537,
"calib/step_conf_rate": 0.984375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 3009.0,
"completions/max_terminated_length": 3009.0,
"completions/mean_length": 407.47265625,
"completions/mean_terminated_length": 417.25201416015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.055466666666666664,
"grad_norm": 0.7831748723983765,
"learning_rate": 4.138888888888889e-06,
"loss": -0.0284,
"num_tokens": 12634616.0,
"reward": 1.4558947086334229,
"reward_std": 0.2825433611869812,
"rewards/accuracy_reward_step": 0.76171875,
"rewards/final_brier_reward_step": 0.7937792539596558,
"rewards/format_reward_step": 0.96484375,
"rewards/stepwise_brier_reward": 0.8032076954841614,
"step": 52
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 2.515625,
"calib/ece": 0.048795180722891435,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.02941331923890056,
"calib/mean_conf": 0.6475903614457831,
"calib/mu_c": 0.656686046511628,
"calib/mu_w": 0.6272727272727274,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.0028112449799196767,
"calib/std_conf": 0.06642186631500918,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 1479.0,
"completions/max_terminated_length": 1479.0,
"completions/mean_length": 412.5546875,
"completions/mean_terminated_length": 422.4560241699219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 112.0,
"epoch": 0.05653333333333333,
"grad_norm": 1.552283763885498,
"learning_rate": 4.111111111111111e-06,
"loss": -0.0257,
"num_tokens": 12869094.0,
"reward": 1.3601384162902832,
"reward_std": 0.2909051179885864,
"rewards/accuracy_reward_step": 0.671875,
"rewards/final_brier_reward_step": 0.7710058689117432,
"rewards/format_reward_step": 0.97265625,
"rewards/stepwise_brier_reward": 0.8219792246818542,
"step": 53
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 2.5234375,
"calib/ece": 0.1659448818897638,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.019533862876254138,
"calib/mean_conf": 0.6529527559055118,
"calib/mu_c": 0.6564903846153846,
"calib/mu_w": 0.6369565217391304,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0,
"calib/std_conf": 0.05682842782393699,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1038.0,
"completions/max_terminated_length": 1038.0,
"completions/mean_length": 361.6796875,
"completions/mean_terminated_length": 365.9683837890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 89.0,
"epoch": 0.0576,
"grad_norm": 1.2853453159332275,
"learning_rate": 4.083333333333334e-06,
"loss": 0.0224,
"num_tokens": 13090956.0,
"reward": 1.529407024383545,
"reward_std": 0.27310827374458313,
"rewards/accuracy_reward_step": 0.8125,
"rewards/final_brier_reward_step": 0.820263683795929,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.8302257657051086,
"step": 54
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 2.43359375,
"calib/ece": 0.15301204819277095,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.04438768870380028,
"calib/mean_conf": 0.657429718875502,
"calib/mu_c": 0.6775735294117647,
"calib/mu_w": 0.6331858407079645,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13212851405622492,
"calib/std_conf": 0.05786547965119041,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 1458.0,
"completions/max_terminated_length": 1458.0,
"completions/mean_length": 365.26953125,
"completions/mean_terminated_length": 375.53814697265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.058666666666666666,
"grad_norm": 0.9097878932952881,
"learning_rate": 4.055555555555556e-06,
"loss": -0.028,
"num_tokens": 13315329.0,
"reward": 1.1907155513763428,
"reward_std": 0.36655521392822266,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.7376757860183716,
"rewards/format_reward_step": 0.97265625,
"rewards/stepwise_brier_reward": 0.7734479904174805,
"step": 55
},
{
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 2.35546875,
"calib/ece": 0.1114919354838711,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.025295360042241755,
"calib/mean_conf": 0.6719758064516128,
"calib/mu_c": 0.6830935251798562,
"calib/mu_w": 0.6577981651376145,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.1114919354838711,
"calib/std_conf": 0.05149897645548441,
"calib/step_conf_rate": 0.98828125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1846.0,
"completions/max_terminated_length": 1846.0,
"completions/mean_length": 361.5859375,
"completions/mean_terminated_length": 373.25,
"completions/min_length": 0.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.05973333333333333,
"grad_norm": 1.6511852741241455,
"learning_rate": 4.027777777777779e-06,
"loss": -0.0633,
"num_tokens": 13537775.0,
"reward": 1.2042827606201172,
"reward_std": 0.3192186653614044,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.7275683879852295,
"rewards/format_reward_step": 0.96875,
"rewards/stepwise_brier_reward": 0.802619218826294,
"step": 56
},
{
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 2.1640625,
"calib/ece": 0.06592741935483865,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.02264263547386014,
"calib/mean_conf": 0.6816532258064517,
"calib/mu_c": 0.688135593220339,
"calib/mu_w": 0.6654929577464789,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.016935483870967723,
"calib/std_conf": 0.05095943110579659,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2458.0,
"completions/max_terminated_length": 2458.0,
"completions/mean_length": 342.28125,
"completions/mean_terminated_length": 350.4960021972656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 89.0,
"epoch": 0.0608,
"grad_norm": 0.83037930727005,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0082,
"num_tokens": 13755231.0,
"reward": 1.3770034313201904,
"reward_std": 0.3410765826702118,
"rewards/accuracy_reward_step": 0.69140625,
"rewards/final_brier_reward_step": 0.7762597799301147,
"rewards/format_reward_step": 0.96875,
"rewards/stepwise_brier_reward": 0.8023691177368164,
"step": 57
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 2.1953125,
"calib/ece": 0.11526104417670681,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.0002533894958537397,
"calib/mean_conf": 0.6791164658634538,
"calib/mu_c": 0.6792253521126762,
"calib/mu_w": 0.6789719626168225,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.11204819277108433,
"calib/std_conf": 0.050356385416037486,
"calib/step_conf_rate": 0.98828125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 1757.0,
"completions/max_terminated_length": 1757.0,
"completions/mean_length": 397.59765625,
"completions/mean_terminated_length": 408.77508544921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 101.0,
"epoch": 0.06186666666666667,
"grad_norm": 0.8900550603866577,
"learning_rate": 3.972222222222223e-06,
"loss": -0.0146,
"num_tokens": 13986376.0,
"reward": 1.205758810043335,
"reward_std": 0.3948570489883423,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.7204296588897705,
"rewards/format_reward_step": 0.97265625,
"rewards/stepwise_brier_reward": 0.7743635773658752,
"step": 58
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 2.234375,
"calib/ece": 0.10098425196850404,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.013437500000000102,
"calib/mean_conf": 0.6915354330708663,
"calib/mu_c": 0.6865625,
"calib/mu_w": 0.7000000000000001,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0812992125984253,
"calib/std_conf": 0.0657872130882196,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1243.0,
"completions/max_terminated_length": 1243.0,
"completions/mean_length": 334.8984375,
"completions/mean_terminated_length": 337.5354309082031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 99.0,
"epoch": 0.06293333333333333,
"grad_norm": 1.1755027770996094,
"learning_rate": 3.944444444444445e-06,
"loss": -0.025,
"num_tokens": 14201398.0,
"reward": 1.300520420074463,
"reward_std": 0.29623836278915405,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7466113567352295,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.8119841814041138,
"step": 59
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/avg_num_step_conf": 2.3359375,
"calib/ece": 0.13963414634146354,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.027178082191780972,
"calib/mean_conf": 0.733130081300813,
"calib/mu_c": 0.7441780821917808,
"calib/mu_w": 0.7169999999999999,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13963414634146354,
"calib/std_conf": 0.05512328523219692,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2096.0,
"completions/max_terminated_length": 2096.0,
"completions/mean_length": 365.56640625,
"completions/mean_terminated_length": 377.3588562011719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.064,
"grad_norm": 0.7373173236846924,
"learning_rate": 3.916666666666667e-06,
"loss": -0.0372,
"num_tokens": 14426879.0,
"reward": 1.2209932804107666,
"reward_std": 0.359197199344635,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7200488448143005,
"rewards/format_reward_step": 0.9609375,
"rewards/stepwise_brier_reward": 0.7782503366470337,
"step": 60
},
{
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 2.3515625,
"calib/ece": 0.0958984375,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.00390625,
"calib/gap": 0.006229542447245384,
"calib/mean_conf": 0.7716796875,
"calib/mu_c": 0.7736994219653177,
"calib/mu_w": 0.7674698795180723,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0958984375,
"calib/std_conf": 0.03682611878683857,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 811.0,
"completions/max_terminated_length": 811.0,
"completions/mean_length": 303.28125,
"completions/mean_terminated_length": 305.6692810058594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 102.0,
"epoch": 0.06506666666666666,
"grad_norm": 0.8471395969390869,
"learning_rate": 3.88888888888889e-06,
"loss": -0.0103,
"num_tokens": 14631623.0,
"reward": 1.3641399145126343,
"reward_std": 0.2312508374452591,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7730761766433716,
"rewards/format_reward_step": 1.0,
"rewards/stepwise_brier_reward": 0.8072821497917175,
"step": 61
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/avg_num_step_conf": 2.64453125,
"calib/ece": 0.10040485829959504,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.004048582995951417,
"calib/gap": 0.02245315698315198,
"calib/mean_conf": 0.8048582995951415,
"calib/mu_c": 0.8114942528735628,
"calib/mu_w": 0.7890410958904108,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10040485829959504,
"calib/std_conf": 0.05580366248649091,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 1762.0,
"completions/max_terminated_length": 1762.0,
"completions/mean_length": 361.671875,
"completions/mean_terminated_length": 374.8502197265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 110.0,
"epoch": 0.06613333333333334,
"grad_norm": 0.7572021484375,
"learning_rate": 3.861111111111112e-06,
"loss": -0.0409,
"num_tokens": 14854331.0,
"reward": 1.3489103317260742,
"reward_std": 0.442284494638443,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/final_brier_reward_step": 0.758544921875,
"rewards/format_reward_step": 0.9609375,
"rewards/stepwise_brier_reward": 0.7754268050193787,
"step": 62
},
{
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 2.84375,
"calib/ece": 0.15887096774193551,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.040039492671071764,
"calib/mean_conf": 0.8483870967741935,
"calib/mu_c": 0.8608187134502924,
"calib/mu_w": 0.8207792207792206,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15887096774193551,
"calib/std_conf": 0.056058539345105705,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1618.0,
"completions/max_terminated_length": 1618.0,
"completions/mean_length": 406.0234375,
"completions/mean_terminated_length": 419.1209411621094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 109.0,
"epoch": 0.0672,
"grad_norm": 0.5545381307601929,
"learning_rate": 3.833333333333334e-06,
"loss": -0.0666,
"num_tokens": 15089953.0,
"reward": 1.3344875574111938,
"reward_std": 0.36827802658081055,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.7504687309265137,
"rewards/format_reward_step": 0.96875,
"rewards/stepwise_brier_reward": 0.7776375412940979,
"step": 63
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/avg_num_step_conf": 2.96484375,
"calib/ece": 0.14575510204081638,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.036734693877551024,
"calib/gap": 0.02861982223712889,
"calib/mean_conf": 0.8722857142857144,
"calib/mu_c": 0.8801123595505618,
"calib/mu_w": 0.8514925373134329,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14575510204081638,
"calib/std_conf": 0.051194706358994514,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 1259.0,
"completions/max_terminated_length": 1259.0,
"completions/mean_length": 358.65234375,
"completions/mean_terminated_length": 373.231689453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.06826666666666667,
"grad_norm": 0.6435424089431763,
"learning_rate": 3.8055555555555556e-06,
"loss": -0.0634,
"num_tokens": 15308584.0,
"reward": 1.3586997985839844,
"reward_std": 0.38508233428001404,
"rewards/accuracy_reward_step": 0.6953125,
"rewards/final_brier_reward_step": 0.7549285292625427,
"rewards/format_reward_step": 0.95703125,
"rewards/stepwise_brier_reward": 0.7608799934387207,
"step": 64
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 2.9296875,
"calib/ece": 0.3197254901960785,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.19215686274509805,
"calib/gap": 0.012923718110634086,
"calib/mean_conf": 0.9001176470588237,
"calib/mu_c": 0.9055405405405408,
"calib/mu_w": 0.8926168224299067,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3197254901960785,
"calib/std_conf": 0.0455939654842756,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1264.0,
"completions/max_terminated_length": 1264.0,
"completions/mean_length": 340.43359375,
"completions/mean_terminated_length": 343.1141662597656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.06933333333333333,
"grad_norm": 0.9150872230529785,
"learning_rate": 3.777777777777778e-06,
"loss": -0.0012,
"num_tokens": 15523799.0,
"reward": 1.1812100410461426,
"reward_std": 0.23995336890220642,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.6558824777603149,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.7021381855010986,
"step": 65
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/avg_num_step_conf": 2.9296875,
"calib/ece": 0.36089430894308955,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.21138211382113822,
"calib/gap": 0.009404484662984824,
"calib/mean_conf": 0.9015447154471546,
"calib/mu_c": 0.9058646616541354,
"calib/mu_w": 0.8964601769911505,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.36089430894308955,
"calib/std_conf": 0.053746355827687775,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 1694.0,
"completions/max_terminated_length": 1694.0,
"completions/mean_length": 417.92578125,
"completions/mean_terminated_length": 434.91461181640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.0704,
"grad_norm": 0.8764449954032898,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.0824,
"num_tokens": 15760180.0,
"reward": 1.0901968479156494,
"reward_std": 0.353530615568161,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.5988469123840332,
"rewards/format_reward_step": 0.9609375,
"rewards/stepwise_brier_reward": 0.7005934715270996,
"step": 66
},
{
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 2.796875,
"calib/ece": 0.29043010752688203,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.4717741935483871,
"calib/gap": 0.020243788091889425,
"calib/mean_conf": 0.9275268817204302,
"calib/mu_c": 0.9348734177215192,
"calib/mu_w": 0.9146296296296298,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.29043010752688203,
"calib/std_conf": 0.057126331373145836,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1463.0,
"completions/max_terminated_length": 1463.0,
"completions/mean_length": 385.08984375,
"completions/mean_terminated_length": 397.5120849609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.07146666666666666,
"grad_norm": 0.8649067878723145,
"learning_rate": 3.7222222222222225e-06,
"loss": -0.0527,
"num_tokens": 15986811.0,
"reward": 1.2355461120605469,
"reward_std": 0.30521702766418457,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.6689639091491699,
"rewards/format_reward_step": 0.96875,
"rewards/stepwise_brier_reward": 0.7480067014694214,
"step": 67
},
{
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 2.7109375,
"calib/ece": 0.3118623481781378,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.7489878542510121,
"calib/gap": 0.011363248471056742,
"calib/mean_conf": 0.9515384615384617,
"calib/mu_c": 0.9556329113924051,
"calib/mu_w": 0.9442696629213484,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3118623481781378,
"calib/std_conf": 0.04500181663518866,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1954.0,
"completions/max_terminated_length": 1954.0,
"completions/mean_length": 389.63671875,
"completions/mean_terminated_length": 402.20562744140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 112.0,
"epoch": 0.07253333333333334,
"grad_norm": 0.6878625750541687,
"learning_rate": 3.694444444444445e-06,
"loss": -0.0724,
"num_tokens": 16213686.0,
"reward": 1.222684383392334,
"reward_std": 0.3363970220088959,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.6517175436019897,
"rewards/format_reward_step": 0.96484375,
"rewards/stepwise_brier_reward": 0.7326153516769409,
"step": 68
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/avg_num_step_conf": 2.8359375,
"calib/ece": 0.3385216326530613,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.8489795918367347,
"calib/gap": 0.03790612244897962,
"calib/mean_conf": 0.9323355102040818,
"calib/mu_c": 0.9474979591836736,
"calib/mu_w": 0.909591836734694,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3354285714285715,
"calib/std_conf": 0.13545859313203842,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 1884.0,
"completions/max_terminated_length": 1884.0,
"completions/mean_length": 456.0078125,
"completions/mean_terminated_length": 474.5447082519531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.0736,
"grad_norm": 0.7570173144340515,
"learning_rate": 3.6666666666666666e-06,
"loss": -0.0322,
"num_tokens": 16457960.0,
"reward": 1.166851282119751,
"reward_std": 0.4555240571498871,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.6214951276779175,
"rewards/format_reward_step": 0.95703125,
"rewards/stepwise_brier_reward": 0.744727373123169,
"step": 69
},
{
"calib/answer_extract_rate": 0.93359375,
"calib/avg_num_step_conf": 2.85546875,
"calib/ece": 0.3566386554621849,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.8781512605042017,
"calib/gap": 0.010691489361701945,
"calib/mean_conf": 0.9532773109243698,
"calib/mu_c": 0.9575,
"calib/mu_w": 0.9468085106382981,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.352436974789916,
"calib/std_conf": 0.06165925944350786,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 1869.0,
"completions/max_terminated_length": 1869.0,
"completions/mean_length": 465.453125,
"completions/mean_terminated_length": 496.4833679199219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 123.0,
"epoch": 0.07466666666666667,
"grad_norm": 0.7547297477722168,
"learning_rate": 3.638888888888889e-06,
"loss": -0.091,
"num_tokens": 16707148.0,
"reward": 1.1299214363098145,
"reward_std": 0.36550194025039673,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.5958437323570251,
"rewards/format_reward_step": 0.92578125,
"rewards/stepwise_brier_reward": 0.7076860666275024,
"step": 70
},
{
"calib/answer_extract_rate": 0.953125,
"calib/avg_num_step_conf": 3.25390625,
"calib/ece": 0.3592358333333335,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.8583333333333333,
"calib/gap": 0.032312013754566826,
"calib/mean_conf": 0.9467358333333334,
"calib/mu_c": 0.9600645390070922,
"calib/mu_w": 0.9277525252525254,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3592358333333335,
"calib/std_conf": 0.08266692486098791,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2263.0,
"completions/max_terminated_length": 2263.0,
"completions/mean_length": 525.49609375,
"completions/mean_terminated_length": 551.3401489257812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.07573333333333333,
"grad_norm": 0.654914915561676,
"learning_rate": 3.6111111111111115e-06,
"loss": -0.0712,
"num_tokens": 16969123.0,
"reward": 1.122267246246338,
"reward_std": 0.48198401927948,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.5975936651229858,
"rewards/format_reward_step": 0.9375,
"rewards/stepwise_brier_reward": 0.7001317739486694,
"step": 71
},
{
"calib/answer_extract_rate": 0.94140625,
"calib/avg_num_step_conf": 3.6015625,
"calib/ece": 0.415502074688797,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.9377593360995851,
"calib/gap": 0.02130618212197144,
"calib/mean_conf": 0.9673692946058092,
"calib/mu_c": 0.9769172932330827,
"calib/mu_w": 0.9556111111111113,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.415502074688797,
"calib/std_conf": 0.04390711304391313,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 1419.0,
"completions/max_terminated_length": 1419.0,
"completions/mean_length": 488.640625,
"completions/mean_terminated_length": 519.053955078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.0768,
"grad_norm": 0.5807398557662964,
"learning_rate": 3.5833333333333335e-06,
"loss": -0.0847,
"num_tokens": 17221663.0,
"reward": 1.066742181777954,
"reward_std": 0.4397001266479492,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.5541670918464661,
"rewards/format_reward_step": 0.94140625,
"rewards/stepwise_brier_reward": 0.7039468288421631,
"step": 72
},
{
"calib/answer_extract_rate": 0.94921875,
"calib/avg_num_step_conf": 4.16796875,
"calib/ece": 0.18308298755186742,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.9045643153526971,
"calib/gap": 0.0503100000000003,
"calib/mean_conf": 0.9410622406639007,
"calib/mu_c": 0.9515000000000003,
"calib/mu_w": 0.90119,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16580705394190892,
"calib/std_conf": 0.1489545415964191,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 1762.0,
"completions/max_terminated_length": 1762.0,
"completions/mean_length": 518.40625,
"completions/mean_terminated_length": 543.901611328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 192.0,
"epoch": 0.07786666666666667,
"grad_norm": 0.39967501163482666,
"learning_rate": 3.555555555555556e-06,
"loss": -0.0999,
"num_tokens": 17484447.0,
"reward": 1.4234883785247803,
"reward_std": 0.34371453523635864,
"rewards/accuracy_reward_step": 0.75390625,
"rewards/final_brier_reward_step": 0.7605338096618652,
"rewards/format_reward_step": 0.94140625,
"rewards/stepwise_brier_reward": 0.7806982398033142,
"step": 73
},
{
"calib/answer_extract_rate": 0.48828125,
"calib/avg_num_step_conf": 21.6328125,
"calib/ece": 0.33212000000000014,
"calib/final_conf_rate": 0.48828125,
"calib/format_rate": 0.484375,
"calib/frac_conf_gt_0.9": 0.968,
"calib/gap": 0.01060416666666708,
"calib/mean_conf": 0.9721200000000002,
"calib/mu_c": 0.9759375000000002,
"calib/mu_w": 0.9653333333333332,
"calib/nonempty_final_conf_rate": 0.48828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.33212000000000014,
"calib/std_conf": 0.027635223899943346,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.50390625,
"completions/max_length": 1721.0,
"completions/max_terminated_length": 1721.0,
"completions/mean_length": 364.89453125,
"completions/mean_terminated_length": 735.535400390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 214.0,
"epoch": 0.07893333333333333,
"grad_norm": 1.1979538202285767,
"learning_rate": 3.5277777777777784e-06,
"loss": -0.6675,
"num_tokens": 17704828.0,
"reward": 0.6136009693145752,
"reward_std": 0.6024596095085144,
"rewards/accuracy_reward_step": 0.3125,
"rewards/final_brier_reward_step": 0.32385730743408203,
"rewards/format_reward_step": 0.484375,
"rewards/stepwise_brier_reward": 0.3629390001296997,
"step": 74
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/avg_num_step_conf": 4.19921875,
"calib/ece": 0.14183673469387784,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.8734693877551021,
"calib/gap": 0.03819538670284972,
"calib/mean_conf": 0.9622448979591839,
"calib/mu_c": 0.9691044776119405,
"calib/mu_w": 0.9309090909090908,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.14183673469387784,
"calib/std_conf": 0.040721317995311763,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2474.0,
"completions/max_terminated_length": 2474.0,
"completions/mean_length": 550.30078125,
"completions/mean_terminated_length": 575.0081176757812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 220.0,
"epoch": 0.08,
"grad_norm": 0.5308865308761597,
"learning_rate": 3.5e-06,
"loss": -0.0702,
"num_tokens": 17973497.0,
"reward": 1.488100290298462,
"reward_std": 0.3117517828941345,
"rewards/accuracy_reward_step": 0.78515625,
"rewards/final_brier_reward_step": 0.8048710823059082,
"rewards/format_reward_step": 0.953125,
"rewards/stepwise_brier_reward": 0.8207836151123047,
"step": 75
},
{
"calib/answer_extract_rate": 0.9375,
"calib/avg_num_step_conf": 3.640625,
"calib/ece": 0.23583333333333345,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.8083333333333333,
"calib/gap": 0.046936181032291024,
"calib/mean_conf": 0.9441666666666667,
"calib/mu_c": 0.9576608187134504,
"calib/mu_w": 0.9107246376811594,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.23375000000000012,
"calib/std_conf": 0.083706663746416,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2172.0,
"completions/max_terminated_length": 2172.0,
"completions/mean_length": 521.97265625,
"completions/mean_terminated_length": 556.7708740234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.08106666666666666,
"grad_norm": 0.4699781537055969,
"learning_rate": 3.4722222222222224e-06,
"loss": -0.1131,
"num_tokens": 18233218.0,
"reward": 1.2967987060546875,
"reward_std": 0.34049344062805176,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.7021620869636536,
"rewards/format_reward_step": 0.92578125,
"rewards/stepwise_brier_reward": 0.7406830787658691,
"step": 76
},
{
"calib/answer_extract_rate": 0.9296875,
"calib/avg_num_step_conf": 3.87890625,
"calib/ece": 0.19236092436974805,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.8151260504201681,
"calib/gap": 0.07027323970037491,
"calib/mean_conf": 0.934025630252101,
"calib/mu_c": 0.951741573033708,
"calib/mu_w": 0.8814683333333331,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.18924369747899172,
"calib/std_conf": 0.11022834690195274,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 2780.0,
"completions/max_terminated_length": 2780.0,
"completions/mean_length": 582.9921875,
"completions/mean_terminated_length": 624.460205078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.08213333333333334,
"grad_norm": 0.4831450581550598,
"learning_rate": 3.444444444444445e-06,
"loss": -0.0916,
"num_tokens": 18510168.0,
"reward": 1.3424919843673706,
"reward_std": 0.40851569175720215,
"rewards/accuracy_reward_step": 0.6953125,
"rewards/final_brier_reward_step": 0.7341254353523254,
"rewards/format_reward_step": 0.92578125,
"rewards/stepwise_brier_reward": 0.7501545548439026,
"step": 77
},
{
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 3.93359375,
"calib/ece": 0.27076612903225805,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6975806451612904,
"calib/gap": 0.05114699971289138,
"calib/mean_conf": 0.923991935483871,
"calib/mu_c": 0.9417283950617285,
"calib/mu_w": 0.8905813953488371,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.27076612903225805,
"calib/std_conf": 0.08716752465407296,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1903.0,
"completions/max_terminated_length": 1903.0,
"completions/mean_length": 653.8671875,
"completions/mean_terminated_length": 674.9596557617188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 248.0,
"epoch": 0.0832,
"grad_norm": 0.5532412528991699,
"learning_rate": 3.416666666666667e-06,
"loss": -0.0544,
"num_tokens": 18808622.0,
"reward": 1.2611587047576904,
"reward_std": 0.2908684015274048,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.6933706998825073,
"rewards/format_reward_step": 0.96875,
"rewards/stepwise_brier_reward": 0.739143431186676,
"step": 78
},
{
"calib/answer_extract_rate": 0.93359375,
"calib/avg_num_step_conf": 3.890625,
"calib/ece": 0.24228991596638672,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.8613445378151261,
"calib/gap": 0.010500000000000287,
"calib/mean_conf": 0.9483823529411766,
"calib/mu_c": 0.9513823529411766,
"calib/mu_w": 0.9408823529411763,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.23819327731092452,
"calib/std_conf": 0.08040606518685899,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 2889.0,
"completions/max_terminated_length": 2889.0,
"completions/mean_length": 601.4453125,
"completions/mean_terminated_length": 644.2259521484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 239.0,
"epoch": 0.08426666666666667,
"grad_norm": 0.47690510749816895,
"learning_rate": 3.3888888888888893e-06,
"loss": -0.0922,
"num_tokens": 19092008.0,
"reward": 1.2916498184204102,
"reward_std": 0.2730216085910797,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.6869811415672302,
"rewards/format_reward_step": 0.9296875,
"rewards/stepwise_brier_reward": 0.7488871812820435,
"step": 79
},
{
"calib/answer_extract_rate": 0.94921875,
"calib/avg_num_step_conf": 3.9375,
"calib/ece": 0.20814814814814825,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.9382716049382716,
"calib/gap": 0.031239867354458717,
"calib/mean_conf": 0.9653497942386833,
"calib/mu_c": 0.9729347826086958,
"calib/mu_w": 0.9416949152542371,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20814814814814825,
"calib/std_conf": 0.04611870832058621,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 1914.0,
"completions/max_terminated_length": 1914.0,
"completions/mean_length": 533.3125,
"completions/mean_terminated_length": 561.8436279296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.08533333333333333,
"grad_norm": 0.35896626114845276,
"learning_rate": 3.3611111111111117e-06,
"loss": -0.0762,
"num_tokens": 19353736.0,
"reward": 1.3742382526397705,
"reward_std": 0.36067700386047363,
"rewards/accuracy_reward_step": 0.71875,
"rewards/final_brier_reward_step": 0.7424664497375488,
"rewards/format_reward_step": 0.94921875,
"rewards/stepwise_brier_reward": 0.7573330402374268,
"step": 80
},
{
"calib/answer_extract_rate": 0.921875,
"calib/avg_num_step_conf": 4.140625,
"calib/ece": 0.2328389830508477,
"calib/final_conf_rate": 0.921875,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.9449152542372882,
"calib/gap": 0.020306323185012243,
"calib/mean_conf": 0.9680084745762714,
"calib/mu_c": 0.9732571428571432,
"calib/mu_w": 0.9529508196721309,
"calib/nonempty_final_conf_rate": 0.921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.22966101694915275,
"calib/std_conf": 0.04510646459788818,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2588.0,
"completions/max_terminated_length": 2588.0,
"completions/mean_length": 591.33984375,
"completions/mean_terminated_length": 641.453369140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 209.0,
"epoch": 0.0864,
"grad_norm": 0.40946197509765625,
"learning_rate": 3.3333333333333333e-06,
"loss": -0.1225,
"num_tokens": 19634407.0,
"reward": 1.3076531887054443,
"reward_std": 0.453918993473053,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.6992918252944946,
"rewards/format_reward_step": 0.91796875,
"rewards/stepwise_brier_reward": 0.7304664850234985,
"step": 81
},
{
"calib/answer_extract_rate": 0.9296875,
"calib/avg_num_step_conf": 4.56640625,
"calib/ece": 0.2556302521008406,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.9495798319327731,
"calib/gap": 0.012528585144453652,
"calib/mean_conf": 0.9697478991596642,
"calib/mu_c": 0.97327485380117,
"calib/mu_w": 0.9607462686567163,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2534453781512608,
"calib/std_conf": 0.03829259719303655,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 1848.0,
"completions/max_terminated_length": 1848.0,
"completions/mean_length": 566.63671875,
"completions/mean_terminated_length": 609.4916381835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 250.0,
"epoch": 0.08746666666666666,
"grad_norm": 0.42007526755332947,
"learning_rate": 3.3055555555555558e-06,
"loss": -0.1233,
"num_tokens": 19908058.0,
"reward": 1.2855520248413086,
"reward_std": 0.3105565309524536,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.686301589012146,
"rewards/format_reward_step": 0.9296875,
"rewards/stepwise_brier_reward": 0.7258551120758057,
"step": 82
},
{
"calib/answer_extract_rate": 0.88671875,
"calib/avg_num_step_conf": 4.91015625,
"calib/ece": 0.30422907488986817,
"calib/final_conf_rate": 0.88671875,
"calib/format_rate": 0.8828125,
"calib/frac_conf_gt_0.9": 0.9383259911894273,
"calib/gap": 0.011205123736493627,
"calib/mean_conf": 0.9694273127753307,
"calib/mu_c": 0.9731788079470202,
"calib/mu_w": 0.9619736842105265,
"calib/nonempty_final_conf_rate": 0.88671875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.30422907488986817,
"calib/std_conf": 0.03488341505762858,
"calib/step_conf_rate": 0.9921875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 2880.0,
"completions/max_terminated_length": 2880.0,
"completions/mean_length": 650.6796875,
"completions/mean_terminated_length": 730.5877075195312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 301.0,
"epoch": 0.08853333333333334,
"grad_norm": 0.4519554376602173,
"learning_rate": 3.277777777777778e-06,
"loss": -0.1525,
"num_tokens": 20204936.0,
"reward": 1.1441466808319092,
"reward_std": 0.35974669456481934,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6066093444824219,
"rewards/format_reward_step": 0.8828125,
"rewards/stepwise_brier_reward": 0.6508677005767822,
"step": 83
},
{
"calib/answer_extract_rate": 0.93359375,
"calib/avg_num_step_conf": 6.29296875,
"calib/ece": 0.25924769874476994,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.9372384937238494,
"calib/gap": 0.03571067348678636,
"calib/mean_conf": 0.9538066945606696,
"calib/mu_c": 0.9641164705882355,
"calib/mu_w": 0.9284057971014491,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.25087866108786616,
"calib/std_conf": 0.11030594026286576,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 1956.0,
"completions/max_terminated_length": 1956.0,
"completions/mean_length": 640.3828125,
"completions/mean_terminated_length": 685.9330444335938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 181.0,
"epoch": 0.0896,
"grad_norm": 0.6918340921401978,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.1044,
"num_tokens": 20497834.0,
"reward": 1.2782249450683594,
"reward_std": 0.37417155504226685,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.6854082345962524,
"rewards/format_reward_step": 0.9296875,
"rewards/stepwise_brier_reward": 0.7139586210250854,
"step": 84
},
{
"calib/answer_extract_rate": 0.83984375,
"calib/avg_num_step_conf": 8.09375,
"calib/ece": 0.3080944186046515,
"calib/final_conf_rate": 0.83984375,
"calib/format_rate": 0.83984375,
"calib/frac_conf_gt_0.9": 0.8744186046511628,
"calib/gap": 0.064403890571753,
"calib/mean_conf": 0.9185102325581397,
"calib/mu_c": 0.9412762589928058,
"calib/mu_w": 0.8768723684210528,
"calib/nonempty_final_conf_rate": 0.83984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2900465116279073,
"calib/std_conf": 0.1921334642412437,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2560.0,
"completions/max_terminated_length": 2560.0,
"completions/mean_length": 711.48046875,
"completions/mean_terminated_length": 843.2361450195312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 204.0,
"epoch": 0.09066666666666667,
"grad_norm": 0.534420907497406,
"learning_rate": 3.2222222222222227e-06,
"loss": -0.243,
"num_tokens": 20810837.0,
"reward": 1.078109860420227,
"reward_std": 0.4689858555793762,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.5794956684112549,
"rewards/format_reward_step": 0.83984375,
"rewards/stepwise_brier_reward": 0.6456356048583984,
"step": 85
},
{
"calib/answer_extract_rate": 0.69921875,
"calib/avg_num_step_conf": 14.1796875,
"calib/ece": 0.3012921348314609,
"calib/final_conf_rate": 0.6953125,
"calib/format_rate": 0.68359375,
"calib/frac_conf_gt_0.9": 0.8707865168539326,
"calib/gap": 0.03486478912708468,
"calib/mean_conf": 0.9527528089887642,
"calib/mu_c": 0.9647008547008551,
"calib/mu_w": 0.9298360655737704,
"calib/nonempty_final_conf_rate": 0.6953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.29837078651685417,
"calib/std_conf": 0.06700457487296918,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.29296875,
"completions/max_length": 2777.0,
"completions/max_terminated_length": 2777.0,
"completions/mean_length": 667.6796875,
"completions/mean_terminated_length": 944.3425903320312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 328.0,
"epoch": 0.09173333333333333,
"grad_norm": 0.47510606050491333,
"learning_rate": 3.1944444444444443e-06,
"loss": -0.3591,
"num_tokens": 21110315.0,
"reward": 0.8938664793968201,
"reward_std": 0.6086052656173706,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.4769015610218048,
"rewards/format_reward_step": 0.68359375,
"rewards/stepwise_brier_reward": 0.5044753551483154,
"step": 86
},
{
"calib/answer_extract_rate": 0.45703125,
"calib/avg_num_step_conf": 26.7734375,
"calib/ece": 0.19243589743589756,
"calib/final_conf_rate": 0.45703125,
"calib/format_rate": 0.45703125,
"calib/frac_conf_gt_0.9": 0.8803418803418803,
"calib/gap": -0.02857770582793684,
"calib/mean_conf": 0.9426923076923079,
"calib/mu_c": 0.9370744680851066,
"calib/mu_w": 0.9656521739130435,
"calib/nonempty_final_conf_rate": 0.45703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16585470085470097,
"calib/std_conf": 0.11494989332320599,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5234375,
"completions/max_length": 2528.0,
"completions/max_terminated_length": 2528.0,
"completions/mean_length": 482.16796875,
"completions/mean_terminated_length": 1011.7622680664062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 417.0,
"epoch": 0.0928,
"grad_norm": 0.7267798185348511,
"learning_rate": 3.1666666666666667e-06,
"loss": -0.5939,
"num_tokens": 21362286.0,
"reward": 0.6898885369300842,
"reward_std": 0.7735934853553772,
"rewards/accuracy_reward_step": 0.3671875,
"rewards/final_brier_reward_step": 0.36581942439079285,
"rewards/format_reward_step": 0.45703125,
"rewards/stepwise_brier_reward": 0.3763526976108551,
"step": 87
},
{
"calib/answer_extract_rate": 0.69140625,
"calib/avg_num_step_conf": 19.7890625,
"calib/ece": 0.2280594444444446,
"calib/final_conf_rate": 0.703125,
"calib/format_rate": 0.6875,
"calib/frac_conf_gt_0.9": 0.8333333333333334,
"calib/gap": 0.11028449519230799,
"calib/mean_conf": 0.9286072222222223,
"calib/mu_c": 0.9604671875000002,
"calib/mu_w": 0.8501826923076922,
"calib/nonempty_final_conf_rate": 0.703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2227777777777779,
"calib/std_conf": 0.14219734175924337,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.296875,
"completions/max_length": 2350.0,
"completions/max_terminated_length": 2350.0,
"completions/mean_length": 742.12109375,
"completions/mean_terminated_length": 1055.461181640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 539.0,
"epoch": 0.09386666666666667,
"grad_norm": 0.6653816103935242,
"learning_rate": 3.138888888888889e-06,
"loss": -0.3913,
"num_tokens": 21685157.0,
"reward": 0.9765293002128601,
"reward_std": 0.737666666507721,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.5399715900421143,
"rewards/format_reward_step": 0.6875,
"rewards/stepwise_brier_reward": 0.5511740446090698,
"step": 88
},
{
"calib/answer_extract_rate": 0.8046875,
"calib/avg_num_step_conf": 11.9375,
"calib/ece": 0.4115776699029128,
"calib/final_conf_rate": 0.8046875,
"calib/format_rate": 0.796875,
"calib/frac_conf_gt_0.9": 0.7330097087378641,
"calib/gap": 0.04470332577475444,
"calib/mean_conf": 0.9261407766990293,
"calib/mu_c": 0.9474074074074077,
"calib/mu_w": 0.9027040816326533,
"calib/nonempty_final_conf_rate": 0.8046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.40672330097087395,
"calib/std_conf": 0.10845465513640232,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1953125,
"completions/max_length": 2870.0,
"completions/max_terminated_length": 2870.0,
"completions/mean_length": 854.83203125,
"completions/mean_terminated_length": 1062.3155517578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 522.0,
"epoch": 0.09493333333333333,
"grad_norm": 0.49867215752601624,
"learning_rate": 3.1111111111111116e-06,
"loss": -0.2103,
"num_tokens": 22035922.0,
"reward": 0.8709223866462708,
"reward_std": 0.5263679027557373,
"rewards/accuracy_reward_step": 0.421875,
"rewards/final_brier_reward_step": 0.4772096872329712,
"rewards/format_reward_step": 0.796875,
"rewards/stepwise_brier_reward": 0.5230201482772827,
"step": 89
},
{
"calib/answer_extract_rate": 0.90625,
"calib/avg_num_step_conf": 9.1328125,
"calib/ece": 0.28369098712446356,
"calib/final_conf_rate": 0.91015625,
"calib/format_rate": 0.890625,
"calib/frac_conf_gt_0.9": 0.8369098712446352,
"calib/gap": 0.04560380479735304,
"calib/mean_conf": 0.9489270386266095,
"calib/mu_c": 0.9641935483870969,
"calib/mu_w": 0.9185897435897439,
"calib/nonempty_final_conf_rate": 0.91015625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.28369098712446356,
"calib/std_conf": 0.06361875987449349,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0859375,
"completions/max_length": 2248.0,
"completions/max_terminated_length": 2248.0,
"completions/mean_length": 852.30859375,
"completions/mean_terminated_length": 932.4402465820312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 495.0,
"epoch": 0.096,
"grad_norm": 0.40822866559028625,
"learning_rate": 3.0833333333333336e-06,
"loss": -0.0858,
"num_tokens": 22380473.0,
"reward": 1.1848654747009277,
"reward_std": 0.36280542612075806,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.6330581903457642,
"rewards/format_reward_step": 0.890625,
"rewards/stepwise_brier_reward": 0.6952204704284668,
"step": 90
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/avg_num_step_conf": 8.44140625,
"calib/ece": 0.20299595141700405,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.6639676113360324,
"calib/gap": 0.08791293213828433,
"calib/mean_conf": 0.9110931174089069,
"calib/mu_c": 0.9363636363636364,
"calib/mu_w": 0.8484507042253521,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20076923076923076,
"calib/std_conf": 0.10060555232743204,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2042.0,
"completions/max_terminated_length": 2042.0,
"completions/mean_length": 860.7890625,
"completions/mean_terminated_length": 892.1538696289062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 470.0,
"epoch": 0.09706666666666666,
"grad_norm": 0.7437296509742737,
"learning_rate": 3.055555555555556e-06,
"loss": -0.0689,
"num_tokens": 22731587.0,
"reward": 1.355747938156128,
"reward_std": 0.3515569865703583,
"rewards/accuracy_reward_step": 0.6875,
"rewards/final_brier_reward_step": 0.750420331954956,
"rewards/format_reward_step": 0.9609375,
"rewards/stepwise_brier_reward": 0.787775993347168,
"step": 91
},
{
"calib/answer_extract_rate": 0.94921875,
"calib/avg_num_step_conf": 8.22265625,
"calib/ece": 0.1214344262295082,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.5860655737704918,
"calib/gap": 0.138848484848485,
"calib/mean_conf": 0.8853688524590165,
"calib/mu_c": 0.9166666666666666,
"calib/mu_w": 0.7778181818181816,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11610655737704918,
"calib/std_conf": 0.13771952376679206,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2596.0,
"completions/max_terminated_length": 2596.0,
"completions/mean_length": 843.82421875,
"completions/mean_terminated_length": 885.32373046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 412.0,
"epoch": 0.09813333333333334,
"grad_norm": 0.5318933129310608,
"learning_rate": 3.0277777777777776e-06,
"loss": -0.0645,
"num_tokens": 23077366.0,
"reward": 1.4378492832183838,
"reward_std": 0.3149486482143402,
"rewards/accuracy_reward_step": 0.73828125,
"rewards/final_brier_reward_step": 0.8006671667098999,
"rewards/format_reward_step": 0.9453125,
"rewards/stepwise_brier_reward": 0.8188130259513855,
"step": 92
},
{
"calib/answer_extract_rate": 0.94921875,
"calib/avg_num_step_conf": 8.53125,
"calib/ece": 0.174672131147541,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.5942622950819673,
"calib/gap": 0.07115423901940765,
"calib/mean_conf": 0.9041803278688525,
"calib/mu_c": 0.923426966292135,
"calib/mu_w": 0.8522727272727274,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.174672131147541,
"calib/std_conf": 0.09896878265835195,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2549.0,
"completions/max_terminated_length": 2549.0,
"completions/mean_length": 825.390625,
"completions/mean_terminated_length": 862.4489135742188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 467.0,
"epoch": 0.0992,
"grad_norm": 0.45298802852630615,
"learning_rate": 3e-06,
"loss": -0.0699,
"num_tokens": 23417482.0,
"reward": 1.3614253997802734,
"reward_std": 0.35973337292671204,
"rewards/accuracy_reward_step": 0.6953125,
"rewards/final_brier_reward_step": 0.7525163888931274,
"rewards/format_reward_step": 0.94921875,
"rewards/stepwise_brier_reward": 0.7797309160232544,
"step": 93
},
{
"calib/answer_extract_rate": 0.94921875,
"calib/avg_num_step_conf": 8.78515625,
"calib/ece": 0.23654320987654331,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.6337448559670782,
"calib/gap": 0.10949923312883447,
"calib/mean_conf": 0.9073251028806585,
"calib/mu_c": 0.9433742331288344,
"calib/mu_w": 0.8338749999999999,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.23654320987654331,
"calib/std_conf": 0.10313125036546462,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2435.0,
"completions/max_terminated_length": 2435.0,
"completions/mean_length": 847.62890625,
"completions/mean_terminated_length": 892.9752807617188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 486.0,
"epoch": 0.10026666666666667,
"grad_norm": 0.4858059883117676,
"learning_rate": 2.9722222222222225e-06,
"loss": -0.0611,
"num_tokens": 23766195.0,
"reward": 1.279847264289856,
"reward_std": 0.2897565960884094,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.7145347595214844,
"rewards/format_reward_step": 0.94140625,
"rewards/stepwise_brier_reward": 0.7668822407722473,
"step": 94
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 9.16015625,
"calib/ece": 0.22775100401606432,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.6305220883534136,
"calib/gap": 0.11558441558441579,
"calib/mean_conf": 0.8904016064257029,
"calib/mu_c": 0.9293939393939395,
"calib/mu_w": 0.8138095238095238,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.22775100401606432,
"calib/std_conf": 0.13193584663745286,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 3007.0,
"completions/max_terminated_length": 3007.0,
"completions/mean_length": 870.45703125,
"completions/mean_terminated_length": 894.9276733398438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 559.0,
"epoch": 0.10133333333333333,
"grad_norm": 0.44509634375572205,
"learning_rate": 2.944444444444445e-06,
"loss": -0.0252,
"num_tokens": 24118200.0,
"reward": 1.3045127391815186,
"reward_std": 0.35857391357421875,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7381042838096619,
"rewards/format_reward_step": 0.97265625,
"rewards/stepwise_brier_reward": 0.774654746055603,
"step": 95
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 8.953125,
"calib/ece": 0.1099601593625499,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.7808764940239044,
"calib/gap": 0.15298029556650283,
"calib/mean_conf": 0.9187250996015938,
"calib/mu_c": 0.9479802955665028,
"calib/mu_w": 0.7949999999999999,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1099601593625499,
"calib/std_conf": 0.12109053002462027,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2716.0,
"completions/max_terminated_length": 2716.0,
"completions/mean_length": 864.4453125,
"completions/mean_terminated_length": 881.6653442382812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 452.0,
"epoch": 0.1024,
"grad_norm": 0.5172838568687439,
"learning_rate": 2.916666666666667e-06,
"loss": -0.0033,
"num_tokens": 24468354.0,
"reward": 1.5302934646606445,
"reward_std": 0.2607869803905487,
"rewards/accuracy_reward_step": 0.79296875,
"rewards/final_brier_reward_step": 0.8489906191825867,
"rewards/format_reward_step": 0.98046875,
"rewards/stepwise_brier_reward": 0.8591302633285522,
"step": 96
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/avg_num_step_conf": 8.375,
"calib/ece": 0.2555465587044535,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.7692307692307693,
"calib/gap": 0.03263424066636922,
"calib/mean_conf": 0.9276113360323887,
"calib/mu_c": 0.9383132530120483,
"calib/mu_w": 0.905679012345679,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2555465587044535,
"calib/std_conf": 0.09808470866513047,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2167.0,
"completions/max_terminated_length": 2167.0,
"completions/mean_length": 834.4140625,
"completions/mean_terminated_length": 861.3306274414062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 428.0,
"epoch": 0.10346666666666667,
"grad_norm": 0.4137607514858246,
"learning_rate": 2.888888888888889e-06,
"loss": -0.0363,
"num_tokens": 24810076.0,
"reward": 1.275789737701416,
"reward_std": 0.3736746907234192,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.6937867403030396,
"rewards/format_reward_step": 0.96484375,
"rewards/stepwise_brier_reward": 0.7358976602554321,
"step": 97
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/avg_num_step_conf": 8.05859375,
"calib/ece": 0.3240408163265308,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.8448979591836735,
"calib/gap": 0.058419754825982606,
"calib/mean_conf": 0.9403673469387757,
"calib/mu_c": 0.9627814569536424,
"calib/mu_w": 0.9043617021276598,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3240408163265308,
"calib/std_conf": 0.1038300601114975,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2224.0,
"completions/max_terminated_length": 2224.0,
"completions/mean_length": 822.93359375,
"completions/mean_terminated_length": 856.3861694335938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 493.0,
"epoch": 0.10453333333333334,
"grad_norm": 0.44194352626800537,
"learning_rate": 2.861111111111111e-06,
"loss": -0.0363,
"num_tokens": 25149971.0,
"reward": 1.183180332183838,
"reward_std": 0.3613991141319275,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6462027430534363,
"rewards/format_reward_step": 0.953125,
"rewards/stepwise_brier_reward": 0.6996912360191345,
"step": 98
},
{
"calib/answer_extract_rate": 0.91796875,
"calib/avg_num_step_conf": 7.71875,
"calib/ece": 0.4068510638297875,
"calib/final_conf_rate": 0.91796875,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.7404255319148936,
"calib/gap": 0.07175891046073613,
"calib/mean_conf": 0.9132340425531916,
"calib/mu_c": 0.9486554621848742,
"calib/mu_w": 0.876896551724138,
"calib/nonempty_final_conf_rate": 0.91796875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4068510638297875,
"calib/std_conf": 0.12783203953026123,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2543.0,
"completions/max_terminated_length": 2543.0,
"completions/mean_length": 853.5234375,
"completions/mean_terminated_length": 925.85595703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 392.0,
"epoch": 0.1056,
"grad_norm": 0.5108424425125122,
"learning_rate": 2.8333333333333335e-06,
"loss": -0.1032,
"num_tokens": 25497313.0,
"reward": 0.9885514974594116,
"reward_std": 0.41032248735427856,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/final_brier_reward_step": 0.5544949769973755,
"rewards/format_reward_step": 0.91796875,
"rewards/stepwise_brier_reward": 0.6186538338661194,
"step": 99
},
{
"calib/answer_extract_rate": 0.953125,
"calib/avg_num_step_conf": 8.37109375,
"calib/ece": 0.2564754098360657,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.7663934426229508,
"calib/gap": 0.07421036585365859,
"calib/mean_conf": 0.9097540983606558,
"calib/mu_c": 0.9340853658536585,
"calib/mu_w": 0.859875,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.247049180327869,
"calib/std_conf": 0.15833644980124587,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2507.0,
"completions/max_terminated_length": 2507.0,
"completions/mean_length": 897.53515625,
"completions/mean_terminated_length": 937.8326416015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 505.0,
"epoch": 0.10666666666666667,
"grad_norm": 0.47045427560806274,
"learning_rate": 2.805555555555556e-06,
"loss": -0.0633,
"num_tokens": 25857530.0,
"reward": 1.2721636295318604,
"reward_std": 0.27989959716796875,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.6965453028678894,
"rewards/format_reward_step": 0.953125,
"rewards/stepwise_brier_reward": 0.7518138885498047,
"step": 100
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 8.61328125,
"calib/ece": 0.30084337349397616,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.8674698795180723,
"calib/gap": 0.05504682843763298,
"calib/mean_conf": 0.9514457831325303,
"calib/mu_c": 0.9706790123456792,
"calib/mu_w": 0.9156321839080462,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.30084337349397616,
"calib/std_conf": 0.07664866694775996,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 1815.0,
"completions/max_terminated_length": 1815.0,
"completions/mean_length": 904.30859375,
"completions/mean_terminated_length": 929.7308959960938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 516.0,
"epoch": 0.10773333333333333,
"grad_norm": 0.6130152344703674,
"learning_rate": 2.7777777777777783e-06,
"loss": -0.035,
"num_tokens": 26219065.0,
"reward": 1.2515060901641846,
"reward_std": 0.4860761761665344,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.682148814201355,
"rewards/format_reward_step": 0.97265625,
"rewards/stepwise_brier_reward": 0.7214141488075256,
"step": 101
},
{
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 9.16015625,
"calib/ece": 0.26318548387096796,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.9233870967741935,
"calib/gap": 0.037605394990366414,
"calib/mean_conf": 0.9607661290322583,
"calib/mu_c": 0.9721387283236997,
"calib/mu_w": 0.9345333333333333,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.26318548387096796,
"calib/std_conf": 0.07202022664714358,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1636.0,
"completions/max_terminated_length": 1636.0,
"completions/mean_length": 842.8203125,
"completions/mean_terminated_length": 870.008056640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 458.0,
"epoch": 0.1088,
"grad_norm": 0.40492483973503113,
"learning_rate": 2.7500000000000004e-06,
"loss": -0.0512,
"num_tokens": 26564563.0,
"reward": 1.3133814334869385,
"reward_std": 0.3594001531600952,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.707624614238739,
"rewards/format_reward_step": 0.96875,
"rewards/stepwise_brier_reward": 0.7476516366004944,
"step": 102
},
{
"calib/answer_extract_rate": 0.9375,
"calib/avg_num_step_conf": 9.2578125,
"calib/ece": 0.364291666666667,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.9083333333333333,
"calib/gap": 0.04623386922356032,
"calib/mean_conf": 0.9601250000000002,
"calib/mu_c": 0.9788111888111892,
"calib/mu_w": 0.9325773195876289,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.364291666666667,
"calib/std_conf": 0.062898603919324,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 2391.0,
"completions/max_terminated_length": 2391.0,
"completions/mean_length": 978.015625,
"completions/mean_terminated_length": 1038.8880615234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 601.0,
"epoch": 0.10986666666666667,
"grad_norm": 0.36557236313819885,
"learning_rate": 2.7222222222222224e-06,
"loss": -0.09,
"num_tokens": 26942527.0,
"reward": 1.1214299201965332,
"reward_std": 0.4106621742248535,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.6044878959655762,
"rewards/format_reward_step": 0.9375,
"rewards/stepwise_brier_reward": 0.6673686504364014,
"step": 103
},
{
"calib/answer_extract_rate": 0.9375,
"calib/avg_num_step_conf": 10.42578125,
"calib/ece": 0.3468333333333336,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.8458333333333333,
"calib/gap": 0.06278422608319545,
"calib/mean_conf": 0.9426666666666669,
"calib/mu_c": 0.9680419580419585,
"calib/mu_w": 0.905257731958763,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3468333333333336,
"calib/std_conf": 0.1027880775619862,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2647.0,
"completions/max_terminated_length": 2647.0,
"completions/mean_length": 965.890625,
"completions/mean_terminated_length": 1013.3933715820312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 562.0,
"epoch": 0.11093333333333333,
"grad_norm": 0.43601280450820923,
"learning_rate": 2.6944444444444444e-06,
"loss": -0.0696,
"num_tokens": 27319515.0,
"reward": 1.1289482116699219,
"reward_std": 0.4071006178855896,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.6174039244651794,
"rewards/format_reward_step": 0.9375,
"rewards/stepwise_brier_reward": 0.6716098785400391,
"step": 104
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/avg_num_step_conf": 11.06640625,
"calib/ece": 0.30672653061224525,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.8571428571428571,
"calib/gap": 0.013188949111713533,
"calib/mean_conf": 0.9383346938775512,
"calib/mu_c": 0.9426951219512197,
"calib/mu_w": 0.9295061728395062,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2878367346938779,
"calib/std_conf": 0.1234164106956118,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2276.0,
"completions/max_terminated_length": 2276.0,
"completions/mean_length": 984.0234375,
"completions/mean_terminated_length": 1019.8785400390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 673.0,
"epoch": 0.112,
"grad_norm": 0.40076950192451477,
"learning_rate": 2.666666666666667e-06,
"loss": -0.034,
"num_tokens": 27700225.0,
"reward": 1.2418036460876465,
"reward_std": 0.3676682710647583,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.6670179963111877,
"rewards/format_reward_step": 0.95703125,
"rewards/stepwise_brier_reward": 0.6878659725189209,
"step": 105
},
{
"calib/answer_extract_rate": 0.9140625,
"calib/avg_num_step_conf": 10.40234375,
"calib/ece": 0.280726495726496,
"calib/final_conf_rate": 0.9140625,
"calib/format_rate": 0.9140625,
"calib/frac_conf_gt_0.9": 0.8717948717948718,
"calib/gap": 0.04368254497002011,
"calib/mean_conf": 0.9525213675213677,
"calib/mu_c": 0.9667088607594939,
"calib/mu_w": 0.9230263157894738,
"calib/nonempty_final_conf_rate": 0.9140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2790170940170943,
"calib/std_conf": 0.08555531007418866,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 2667.0,
"completions/max_terminated_length": 2667.0,
"completions/mean_length": 905.91015625,
"completions/mean_terminated_length": 978.5358276367188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 619.0,
"epoch": 0.11306666666666666,
"grad_norm": 0.37359946966171265,
"learning_rate": 2.6388888888888893e-06,
"loss": -0.0989,
"num_tokens": 28059762.0,
"reward": 1.206790566444397,
"reward_std": 0.32431334257125854,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.6541393995285034,
"rewards/format_reward_step": 0.9140625,
"rewards/stepwise_brier_reward": 0.6845083832740784,
"step": 106
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 10.4453125,
"calib/ece": 0.31365461847389586,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9156626506024096,
"calib/gap": 0.030393784589186734,
"calib/mean_conf": 0.9642570281124498,
"calib/mu_c": 0.9748765432098767,
"calib/mu_w": 0.94448275862069,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.31365461847389586,
"calib/std_conf": 0.056457950738666114,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 1644.0,
"completions/max_terminated_length": 1644.0,
"completions/mean_length": 893.71484375,
"completions/mean_terminated_length": 918.8392944335938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 525.0,
"epoch": 0.11413333333333334,
"grad_norm": 0.31740349531173706,
"learning_rate": 2.6111111111111113e-06,
"loss": -0.0245,
"num_tokens": 28416209.0,
"reward": 1.2470124959945679,
"reward_std": 0.4013622999191284,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.6662039160728455,
"rewards/format_reward_step": 0.97265625,
"rewards/stepwise_brier_reward": 0.7353296875953674,
"step": 107
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 9.640625,
"calib/ece": 0.24119521912350617,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9083665338645418,
"calib/gap": 0.03851799687010993,
"calib/mean_conf": 0.9583266932270917,
"calib/mu_c": 0.9692222222222225,
"calib/mu_w": 0.9307042253521126,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.24119521912350617,
"calib/std_conf": 0.07658282528158045,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 1755.0,
"completions/max_terminated_length": 1755.0,
"completions/mean_length": 881.37109375,
"completions/mean_terminated_length": 898.9282836914062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 484.0,
"epoch": 0.1152,
"grad_norm": 0.35262876749038696,
"learning_rate": 2.5833333333333337e-06,
"loss": -0.0307,
"num_tokens": 28768112.0,
"reward": 1.3585948944091797,
"reward_std": 0.3439452350139618,
"rewards/accuracy_reward_step": 0.703125,
"rewards/final_brier_reward_step": 0.7341094017028809,
"rewards/format_reward_step": 0.98046875,
"rewards/stepwise_brier_reward": 0.7614732980728149,
"step": 108
},
{
"calib/answer_extract_rate": 0.93359375,
"calib/avg_num_step_conf": 8.421875,
"calib/ece": 0.34769874476987467,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.7949790794979079,
"calib/gap": 0.08079352517985616,
"calib/mean_conf": 0.9292887029288707,
"calib/mu_c": 0.9630935251798564,
"calib/mu_w": 0.8823000000000002,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.34769874476987467,
"calib/std_conf": 0.11007755120020576,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2230.0,
"completions/max_terminated_length": 2230.0,
"completions/mean_length": 827.61328125,
"completions/mean_terminated_length": 882.7875366210938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 535.0,
"epoch": 0.11626666666666667,
"grad_norm": 1.1841824054718018,
"learning_rate": 2.5555555555555557e-06,
"loss": -0.0787,
"num_tokens": 29107621.0,
"reward": 1.1153641939163208,
"reward_std": 0.3671267330646515,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.6189414262771606,
"rewards/format_reward_step": 0.93359375,
"rewards/stepwise_brier_reward": 0.6782614588737488,
"step": 109
},
{
"calib/answer_extract_rate": 0.9296875,
"calib/avg_num_step_conf": 8.17578125,
"calib/ece": 0.23210084033613476,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.9033613445378151,
"calib/gap": 0.025639367816092107,
"calib/mean_conf": 0.95436974789916,
"calib/mu_c": 0.9612643678160923,
"calib/mu_w": 0.9356250000000002,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.22768907563025242,
"calib/std_conf": 0.08855173349505058,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 1379.0,
"completions/max_terminated_length": 1379.0,
"completions/mean_length": 713.41015625,
"completions/mean_terminated_length": 764.15478515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 311.0,
"epoch": 0.11733333333333333,
"grad_norm": 0.5043585896492004,
"learning_rate": 2.5277777777777778e-06,
"loss": -0.0858,
"num_tokens": 29418214.0,
"reward": 1.3116428852081299,
"reward_std": 0.4207128584384918,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/final_brier_reward_step": 0.7026492357254028,
"rewards/format_reward_step": 0.9296875,
"rewards/stepwise_brier_reward": 0.7506482005119324,
"step": 110
},
{
"calib/answer_extract_rate": 0.921875,
"calib/avg_num_step_conf": 7.4375,
"calib/ece": 0.22889830508474598,
"calib/final_conf_rate": 0.921875,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.8601694915254238,
"calib/gap": 0.11674563915647018,
"calib/mean_conf": 0.936525423728814,
"calib/mu_c": 0.970658682634731,
"calib/mu_w": 0.8539130434782608,
"calib/nonempty_final_conf_rate": 0.921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.22889830508474598,
"calib/std_conf": 0.12869868316556027,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 2474.0,
"completions/max_terminated_length": 2474.0,
"completions/mean_length": 727.78125,
"completions/mean_terminated_length": 782.8235473632812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 420.0,
"epoch": 0.1184,
"grad_norm": 0.3411725163459778,
"learning_rate": 2.5e-06,
"loss": -0.1006,
"num_tokens": 29734974.0,
"reward": 1.2854728698730469,
"reward_std": 0.3727434575557709,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.712110161781311,
"rewards/format_reward_step": 0.921875,
"rewards/stepwise_brier_reward": 0.7395458221435547,
"step": 111
},
{
"calib/answer_extract_rate": 0.90625,
"calib/avg_num_step_conf": 6.8046875,
"calib/ece": 0.25693965517241396,
"calib/final_conf_rate": 0.90625,
"calib/format_rate": 0.90625,
"calib/frac_conf_gt_0.9": 0.7327586206896551,
"calib/gap": 0.10153679653679648,
"calib/mean_conf": 0.9207327586206897,
"calib/mu_c": 0.95487012987013,
"calib/mu_w": 0.8533333333333335,
"calib/nonempty_final_conf_rate": 0.90625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.25693965517241396,
"calib/std_conf": 0.11295741169975192,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 1881.0,
"completions/max_terminated_length": 1881.0,
"completions/mean_length": 734.1171875,
"completions/mean_terminated_length": 810.0603637695312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 463.0,
"epoch": 0.11946666666666667,
"grad_norm": 0.35230952501296997,
"learning_rate": 2.4722222222222226e-06,
"loss": -0.1798,
"num_tokens": 30053868.0,
"reward": 1.2051305770874023,
"reward_std": 0.3573354482650757,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6736800670623779,
"rewards/format_reward_step": 0.90625,
"rewards/stepwise_brier_reward": 0.7044122815132141,
"step": 112
},
{
"calib/answer_extract_rate": 0.8828125,
"calib/avg_num_step_conf": 6.85546875,
"calib/ece": 0.2870353982300886,
"calib/final_conf_rate": 0.8828125,
"calib/format_rate": 0.87890625,
"calib/frac_conf_gt_0.9": 0.7610619469026548,
"calib/gap": 0.08733844189016637,
"calib/mean_conf": 0.928628318584071,
"calib/mu_c": 0.9599310344827588,
"calib/mu_w": 0.8725925925925925,
"calib/nonempty_final_conf_rate": 0.8828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2870353982300886,
"calib/std_conf": 0.10107113894312923,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1171875,
"completions/max_length": 1499.0,
"completions/max_terminated_length": 1499.0,
"completions/mean_length": 658.3359375,
"completions/mean_terminated_length": 745.7256469726562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 434.0,
"epoch": 0.12053333333333334,
"grad_norm": 0.39759573340415955,
"learning_rate": 2.4444444444444447e-06,
"loss": -0.0975,
"num_tokens": 30350642.0,
"reward": 1.137017846107483,
"reward_std": 0.3805588483810425,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6296113133430481,
"rewards/format_reward_step": 0.87890625,
"rewards/stepwise_brier_reward": 0.6716611385345459,
"step": 113
},
{
"calib/answer_extract_rate": 0.90234375,
"calib/avg_num_step_conf": 6.89453125,
"calib/ece": 0.17225108225108265,
"calib/final_conf_rate": 0.90234375,
"calib/format_rate": 0.90234375,
"calib/frac_conf_gt_0.9": 0.8658008658008658,
"calib/gap": 0.06126077348066328,
"calib/mean_conf": 0.9558008658008661,
"calib/mu_c": 0.9690607734806633,
"calib/mu_w": 0.9078,
"calib/nonempty_final_conf_rate": 0.90234375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17225108225108265,
"calib/std_conf": 0.07070537736877427,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09765625,
"completions/max_length": 2000.0,
"completions/max_terminated_length": 2000.0,
"completions/mean_length": 685.60546875,
"completions/mean_terminated_length": 759.80517578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 366.0,
"epoch": 0.1216,
"grad_norm": 0.6093668937683105,
"learning_rate": 2.4166666666666667e-06,
"loss": -0.1186,
"num_tokens": 30654221.0,
"reward": 1.3461461067199707,
"reward_std": 0.2577363848686218,
"rewards/accuracy_reward_step": 0.70703125,
"rewards/final_brier_reward_step": 0.7367730140686035,
"rewards/format_reward_step": 0.90234375,
"rewards/stepwise_brier_reward": 0.7219762802124023,
"step": 114
},
{
"calib/answer_extract_rate": 0.90234375,
"calib/avg_num_step_conf": 6.59765625,
"calib/ece": 0.334675324675325,
"calib/final_conf_rate": 0.90234375,
"calib/format_rate": 0.90234375,
"calib/frac_conf_gt_0.9": 0.8961038961038961,
"calib/gap": 0.0193078162771958,
"calib/mean_conf": 0.9609090909090913,
"calib/mu_c": 0.9680136986301373,
"calib/mu_w": 0.9487058823529415,
"calib/nonempty_final_conf_rate": 0.90234375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3317748917748921,
"calib/std_conf": 0.05808297881632693,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09765625,
"completions/max_length": 1601.0,
"completions/max_terminated_length": 1601.0,
"completions/mean_length": 697.33203125,
"completions/mean_terminated_length": 772.8008422851562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 432.0,
"epoch": 0.12266666666666666,
"grad_norm": 0.4685302674770355,
"learning_rate": 2.388888888888889e-06,
"loss": -0.1102,
"num_tokens": 30961042.0,
"reward": 1.121100664138794,
"reward_std": 0.5016540288925171,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.599951982498169,
"rewards/format_reward_step": 0.90234375,
"rewards/stepwise_brier_reward": 0.6423114538192749,
"step": 115
},
{
"calib/answer_extract_rate": 0.953125,
"calib/avg_num_step_conf": 6.71875,
"calib/ece": 0.2640163934426233,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.8934426229508197,
"calib/gap": 0.029723370429252882,
"calib/mean_conf": 0.9605737704918036,
"calib/mu_c": 0.9695882352941179,
"calib/mu_w": 0.939864864864865,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2639344262295085,
"calib/std_conf": 0.06025627210843981,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2797.0,
"completions/max_terminated_length": 2797.0,
"completions/mean_length": 721.35546875,
"completions/mean_terminated_length": 750.6788330078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 257.0,
"epoch": 0.12373333333333333,
"grad_norm": 0.43929150700569153,
"learning_rate": 2.361111111111111e-06,
"loss": -0.0408,
"num_tokens": 31273269.0,
"reward": 1.2943837642669678,
"reward_std": 0.3709738850593567,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.6938859224319458,
"rewards/format_reward_step": 0.953125,
"rewards/stepwise_brier_reward": 0.7522628307342529,
"step": 116
},
{
"calib/answer_extract_rate": 0.92578125,
"calib/avg_num_step_conf": 6.43359375,
"calib/ece": 0.348649789029536,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.869198312236287,
"calib/gap": 0.037708333333333344,
"calib/mean_conf": 0.9562447257383969,
"calib/mu_c": 0.9710416666666669,
"calib/mu_w": 0.9333333333333336,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.348649789029536,
"calib/std_conf": 0.0659163837386271,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 1868.0,
"completions/max_terminated_length": 1868.0,
"completions/mean_length": 687.9375,
"completions/mean_terminated_length": 743.0885620117188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 393.0,
"epoch": 0.1248,
"grad_norm": 0.35367223620414734,
"learning_rate": 2.3333333333333336e-06,
"loss": -0.118,
"num_tokens": 31579021.0,
"reward": 1.1236909627914429,
"reward_std": 0.45129209756851196,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.6051425933837891,
"rewards/format_reward_step": 0.92578125,
"rewards/stepwise_brier_reward": 0.6641662120819092,
"step": 117
},
{
"calib/answer_extract_rate": 0.91015625,
"calib/avg_num_step_conf": 6.203125,
"calib/ece": 0.25660944206008607,
"calib/final_conf_rate": 0.91015625,
"calib/format_rate": 0.91015625,
"calib/frac_conf_gt_0.9": 0.8454935622317596,
"calib/gap": 0.052954268822813955,
"calib/mean_conf": 0.9518884120171676,
"calib/mu_c": 0.968024691358025,
"calib/mu_w": 0.9150704225352111,
"calib/nonempty_final_conf_rate": 0.91015625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.25660944206008607,
"calib/std_conf": 0.07061863663345236,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08984375,
"completions/max_length": 1695.0,
"completions/max_terminated_length": 1695.0,
"completions/mean_length": 668.14453125,
"completions/mean_terminated_length": 734.0986938476562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 416.0,
"epoch": 0.12586666666666665,
"grad_norm": 0.4721923768520355,
"learning_rate": 2.305555555555556e-06,
"loss": -0.1234,
"num_tokens": 31877114.0,
"reward": 1.2356454133987427,
"reward_std": 0.41479313373565674,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.6732761859893799,
"rewards/format_reward_step": 0.91015625,
"rewards/stepwise_brier_reward": 0.7007163763046265,
"step": 118
},
{
"calib/answer_extract_rate": 0.890625,
"calib/avg_num_step_conf": 5.796875,
"calib/ece": 0.24078947368421072,
"calib/final_conf_rate": 0.890625,
"calib/format_rate": 0.890625,
"calib/frac_conf_gt_0.9": 0.8114035087719298,
"calib/gap": 0.08124145474432654,
"calib/mean_conf": 0.9320175438596494,
"calib/mu_c": 0.956603773584906,
"calib/mu_w": 0.8753623188405795,
"calib/nonempty_final_conf_rate": 0.890625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.23771929824561422,
"calib/std_conf": 0.13266618062523194,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1015625,
"completions/max_length": 2246.0,
"completions/max_terminated_length": 2246.0,
"completions/mean_length": 668.76171875,
"completions/mean_terminated_length": 744.36083984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 383.0,
"epoch": 0.12693333333333334,
"grad_norm": 0.31884142756462097,
"learning_rate": 2.277777777777778e-06,
"loss": -0.1035,
"num_tokens": 32176421.0,
"reward": 1.2189477682113647,
"reward_std": 0.46411794424057007,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.6684898138046265,
"rewards/format_reward_step": 0.890625,
"rewards/stepwise_brier_reward": 0.6981865763664246,
"step": 119
},
{
"calib/answer_extract_rate": 0.88671875,
"calib/avg_num_step_conf": 5.7265625,
"calib/ece": 0.28224669603524255,
"calib/final_conf_rate": 0.88671875,
"calib/format_rate": 0.88671875,
"calib/frac_conf_gt_0.9": 0.8986784140969163,
"calib/gap": 0.04297367016545117,
"calib/mean_conf": 0.960660792951542,
"calib/mu_c": 0.9744805194805198,
"calib/mu_w": 0.9315068493150687,
"calib/nonempty_final_conf_rate": 0.88671875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.28224669603524255,
"calib/std_conf": 0.06332911311975102,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 1854.0,
"completions/max_terminated_length": 1854.0,
"completions/mean_length": 656.28125,
"completions/mean_terminated_length": 736.877197265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 357.0,
"epoch": 0.128,
"grad_norm": 0.3318020701408386,
"learning_rate": 2.25e-06,
"loss": -0.0986,
"num_tokens": 32474157.0,
"reward": 1.182955265045166,
"reward_std": 0.4572509527206421,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.635696530342102,
"rewards/format_reward_step": 0.88671875,
"rewards/stepwise_brier_reward": 0.6994906067848206,
"step": 120
},
{
"calib/answer_extract_rate": 0.8984375,
"calib/avg_num_step_conf": 5.74609375,
"calib/ece": 0.30278260869565243,
"calib/final_conf_rate": 0.8984375,
"calib/format_rate": 0.8984375,
"calib/frac_conf_gt_0.9": 0.8782608695652174,
"calib/gap": 0.04957500000000026,
"calib/mean_conf": 0.9549565217391306,
"calib/mu_c": 0.9722000000000003,
"calib/mu_w": 0.922625,
"calib/nonempty_final_conf_rate": 0.8984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.30278260869565243,
"calib/std_conf": 0.07149185314539393,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09765625,
"completions/max_length": 2846.0,
"completions/max_terminated_length": 2846.0,
"completions/mean_length": 673.86328125,
"completions/mean_terminated_length": 746.792236328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 373.0,
"epoch": 0.12906666666666666,
"grad_norm": 0.36813440918922424,
"learning_rate": 2.222222222222222e-06,
"loss": -0.1444,
"num_tokens": 32774762.0,
"reward": 1.1619811058044434,
"reward_std": 0.45201846957206726,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6278820633888245,
"rewards/format_reward_step": 0.8984375,
"rewards/stepwise_brier_reward": 0.6890350580215454,
"step": 121
},
{
"calib/answer_extract_rate": 0.93359375,
"calib/avg_num_step_conf": 5.96484375,
"calib/ece": 0.2530962343096237,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.8577405857740585,
"calib/gap": 0.07657684630738548,
"calib/mean_conf": 0.9518410041841007,
"calib/mu_c": 0.9749101796407189,
"calib/mu_w": 0.8983333333333334,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2530962343096237,
"calib/std_conf": 0.07904741329596059,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 2748.0,
"completions/max_terminated_length": 2748.0,
"completions/mean_length": 681.59765625,
"completions/mean_terminated_length": 730.0794677734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 383.0,
"epoch": 0.13013333333333332,
"grad_norm": 0.4585769772529602,
"learning_rate": 2.1944444444444445e-06,
"loss": -0.0791,
"num_tokens": 33079635.0,
"reward": 1.2849833965301514,
"reward_std": 0.2783547639846802,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.7015323638916016,
"rewards/format_reward_step": 0.93359375,
"rewards/stepwise_brier_reward": 0.7540565133094788,
"step": 122
},
{
"calib/answer_extract_rate": 0.90234375,
"calib/avg_num_step_conf": 5.55078125,
"calib/ece": 0.27935064935064957,
"calib/final_conf_rate": 0.90234375,
"calib/format_rate": 0.90234375,
"calib/frac_conf_gt_0.9": 0.7229437229437229,
"calib/gap": 0.0464257161892071,
"calib/mean_conf": 0.9233333333333336,
"calib/mu_c": 0.9392105263157896,
"calib/mu_w": 0.8927848101265825,
"calib/nonempty_final_conf_rate": 0.90234375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.27233766233766254,
"calib/std_conf": 0.10917108645756951,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09765625,
"completions/max_length": 2970.0,
"completions/max_terminated_length": 2970.0,
"completions/mean_length": 742.8359375,
"completions/mean_terminated_length": 823.2294311523438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 310.0,
"epoch": 0.1312,
"grad_norm": 0.30729711055755615,
"learning_rate": 2.166666666666667e-06,
"loss": -0.1733,
"num_tokens": 33398129.0,
"reward": 1.1760551929473877,
"reward_std": 0.47516506910324097,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6438636779785156,
"rewards/format_reward_step": 0.90234375,
"rewards/stepwise_brier_reward": 0.6805557608604431,
"step": 123
},
{
"calib/answer_extract_rate": 0.90234375,
"calib/avg_num_step_conf": 6.02734375,
"calib/ece": 0.16931034482758647,
"calib/final_conf_rate": 0.90625,
"calib/format_rate": 0.90234375,
"calib/frac_conf_gt_0.9": 0.8491379310344828,
"calib/gap": 0.11721367521367587,
"calib/mean_conf": 0.9451724137931036,
"calib/mu_c": 0.9714444444444449,
"calib/mu_w": 0.854230769230769,
"calib/nonempty_final_conf_rate": 0.90625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16931034482758647,
"calib/std_conf": 0.10047287126230067,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 1871.0,
"completions/max_terminated_length": 1871.0,
"completions/mean_length": 694.0625,
"completions/mean_terminated_length": 765.862060546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 391.0,
"epoch": 0.13226666666666667,
"grad_norm": 0.4728807210922241,
"learning_rate": 2.138888888888889e-06,
"loss": -0.1137,
"num_tokens": 33705665.0,
"reward": 1.3616459369659424,
"reward_std": 0.38314908742904663,
"rewards/accuracy_reward_step": 0.703125,
"rewards/final_brier_reward_step": 0.7490648031234741,
"rewards/format_reward_step": 0.90234375,
"rewards/stepwise_brier_reward": 0.7750166654586792,
"step": 124
},
{
"calib/answer_extract_rate": 0.86328125,
"calib/avg_num_step_conf": 6.12109375,
"calib/ece": 0.21452488687782828,
"calib/final_conf_rate": 0.86328125,
"calib/format_rate": 0.86328125,
"calib/frac_conf_gt_0.9": 0.7918552036199095,
"calib/gap": 0.042329985352584365,
"calib/mean_conf": 0.9408597285067877,
"calib/mu_c": 0.9521604938271606,
"calib/mu_w": 0.9098305084745762,
"calib/nonempty_final_conf_rate": 0.86328125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21117647058823552,
"calib/std_conf": 0.08976935381851739,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1328125,
"completions/max_length": 2943.0,
"completions/max_terminated_length": 2943.0,
"completions/mean_length": 675.3046875,
"completions/mean_terminated_length": 778.729736328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 313.0,
"epoch": 0.13333333333333333,
"grad_norm": 0.4558548331260681,
"learning_rate": 2.1111111111111114e-06,
"loss": -0.1445,
"num_tokens": 34006391.0,
"reward": 1.2194066047668457,
"reward_std": 0.4491131007671356,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.6643987894058228,
"rewards/format_reward_step": 0.86328125,
"rewards/stepwise_brier_reward": 0.6722660064697266,
"step": 125
},
{
"calib/answer_extract_rate": 0.90625,
"calib/avg_num_step_conf": 6.60546875,
"calib/ece": 0.3132327586206898,
"calib/final_conf_rate": 0.90625,
"calib/format_rate": 0.90234375,
"calib/frac_conf_gt_0.9": 0.6767241379310345,
"calib/gap": 0.08548674606223583,
"calib/mean_conf": 0.8980603448275863,
"calib/mu_c": 0.933065693430657,
"calib/mu_w": 0.8475789473684212,
"calib/nonempty_final_conf_rate": 0.90625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3103879310344829,
"calib/std_conf": 0.1507771127784722,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2082.0,
"completions/max_terminated_length": 2082.0,
"completions/mean_length": 750.828125,
"completions/mean_terminated_length": 828.5,
"completions/min_length": 0.0,
"completions/min_terminated_length": 388.0,
"epoch": 0.1344,
"grad_norm": 0.34735116362571716,
"learning_rate": 2.0833333333333334e-06,
"loss": -0.1487,
"num_tokens": 34327107.0,
"reward": 1.1019737720489502,
"reward_std": 0.4112528860569,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6155366897583008,
"rewards/format_reward_step": 0.90234375,
"rewards/stepwise_brier_reward": 0.675259530544281,
"step": 126
},
{
"calib/answer_extract_rate": 0.828125,
"calib/avg_num_step_conf": 6.5546875,
"calib/ece": 0.2346226415094343,
"calib/final_conf_rate": 0.828125,
"calib/format_rate": 0.828125,
"calib/frac_conf_gt_0.9": 0.7216981132075472,
"calib/gap": 0.04985702614379106,
"calib/mean_conf": 0.9101886792452832,
"calib/mu_c": 0.9261805555555556,
"calib/mu_w": 0.8763235294117645,
"calib/nonempty_final_conf_rate": 0.828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.23278301886792485,
"calib/std_conf": 0.13095787261612948,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.171875,
"completions/max_length": 2102.0,
"completions/max_terminated_length": 2102.0,
"completions/mean_length": 708.14453125,
"completions/mean_terminated_length": 855.117919921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 456.0,
"epoch": 0.13546666666666668,
"grad_norm": 0.4462392330169678,
"learning_rate": 2.0555555555555555e-06,
"loss": -0.2065,
"num_tokens": 34635104.0,
"reward": 1.1081023216247559,
"reward_std": 0.45589950680732727,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.60732102394104,
"rewards/format_reward_step": 0.828125,
"rewards/stepwise_brier_reward": 0.636517345905304,
"step": 127
},
{
"calib/answer_extract_rate": 0.85546875,
"calib/avg_num_step_conf": 6.9296875,
"calib/ece": 0.25981735159817365,
"calib/final_conf_rate": 0.85546875,
"calib/format_rate": 0.85546875,
"calib/frac_conf_gt_0.9": 0.6940639269406392,
"calib/gap": 0.13903062132811095,
"calib/mean_conf": 0.8853881278538814,
"calib/mu_c": 0.9374452554744526,
"calib/mu_w": 0.7984146341463416,
"calib/nonempty_final_conf_rate": 0.85546875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.25981735159817365,
"calib/std_conf": 0.1690348915527457,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.14453125,
"completions/max_length": 2035.0,
"completions/max_terminated_length": 2035.0,
"completions/mean_length": 780.609375,
"completions/mean_terminated_length": 912.4931030273438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 484.0,
"epoch": 0.13653333333333334,
"grad_norm": 0.46783876419067383,
"learning_rate": 2.027777777777778e-06,
"loss": -0.2206,
"num_tokens": 34964644.0,
"reward": 1.098130464553833,
"reward_std": 0.49286070466041565,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6286163926124573,
"rewards/format_reward_step": 0.85546875,
"rewards/stepwise_brier_reward": 0.6524767279624939,
"step": 128
},
{
"calib/answer_extract_rate": 0.88671875,
"calib/avg_num_step_conf": 7.5625,
"calib/ece": 0.18629955947136573,
"calib/final_conf_rate": 0.88671875,
"calib/format_rate": 0.88671875,
"calib/frac_conf_gt_0.9": 0.6828193832599119,
"calib/gap": 0.0761440185830432,
"calib/mean_conf": 0.8869162995594716,
"calib/mu_c": 0.908048780487805,
"calib/mu_w": 0.8319047619047618,
"calib/nonempty_final_conf_rate": 0.88671875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17537444933920712,
"calib/std_conf": 0.16700050660309387,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.11328125,
"completions/max_length": 2785.0,
"completions/max_terminated_length": 2785.0,
"completions/mean_length": 824.41015625,
"completions/mean_terminated_length": 929.7312622070312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 436.0,
"epoch": 0.1376,
"grad_norm": 0.38593611121177673,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.2057,
"num_tokens": 35301117.0,
"reward": 1.2500499486923218,
"reward_std": 0.3510856628417969,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.6872902512550354,
"rewards/format_reward_step": 0.88671875,
"rewards/stepwise_brier_reward": 0.7084317803382874,
"step": 129
},
{
"calib/answer_extract_rate": 0.91015625,
"calib/avg_num_step_conf": 7.9375,
"calib/ece": 0.1750214592274681,
"calib/final_conf_rate": 0.91015625,
"calib/format_rate": 0.91015625,
"calib/frac_conf_gt_0.9": 0.7510729613733905,
"calib/gap": 0.20511996336996352,
"calib/mean_conf": 0.896051502145923,
"calib/mu_c": 0.9532738095238097,
"calib/mu_w": 0.7481538461538462,
"calib/nonempty_final_conf_rate": 0.91015625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1750214592274681,
"calib/std_conf": 0.17012451110542373,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08984375,
"completions/max_length": 1702.0,
"completions/max_terminated_length": 1702.0,
"completions/mean_length": 816.72265625,
"completions/mean_terminated_length": 897.3433227539062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 361.0,
"epoch": 0.13866666666666666,
"grad_norm": 0.5547916889190674,
"learning_rate": 1.9722222222222224e-06,
"loss": -0.1407,
"num_tokens": 35638526.0,
"reward": 1.314697265625,
"reward_std": 0.34050452709198,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.7479640245437622,
"rewards/format_reward_step": 0.91015625,
"rewards/stepwise_brier_reward": 0.7737981081008911,
"step": 130
},
{
"calib/answer_extract_rate": 0.83984375,
"calib/avg_num_step_conf": 7.90234375,
"calib/ece": 0.30102325581395367,
"calib/final_conf_rate": 0.83984375,
"calib/format_rate": 0.83984375,
"calib/frac_conf_gt_0.9": 0.6,
"calib/gap": 0.13081495098039242,
"calib/mean_conf": 0.8468837209302328,
"calib/mu_c": 0.9052941176470591,
"calib/mu_w": 0.7744791666666667,
"calib/nonempty_final_conf_rate": 0.83984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.29720930232558157,
"calib/std_conf": 0.19433622378374746,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.16015625,
"completions/max_length": 2657.0,
"completions/max_terminated_length": 2657.0,
"completions/mean_length": 865.578125,
"completions/mean_terminated_length": 1030.641845703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 544.0,
"epoch": 0.13973333333333332,
"grad_norm": 0.3589726686477661,
"learning_rate": 1.944444444444445e-06,
"loss": -0.1978,
"num_tokens": 35989362.0,
"reward": 0.9909417629241943,
"reward_std": 0.3934427499771118,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/final_brier_reward_step": 0.58257657289505,
"rewards/format_reward_step": 0.83984375,
"rewards/stepwise_brier_reward": 0.603301465511322,
"step": 131
},
{
"calib/answer_extract_rate": 0.765625,
"calib/avg_num_step_conf": 7.29296875,
"calib/ece": 0.12122448979591849,
"calib/final_conf_rate": 0.765625,
"calib/format_rate": 0.765625,
"calib/frac_conf_gt_0.9": 0.6887755102040817,
"calib/gap": 0.20253333333333368,
"calib/mean_conf": 0.8700000000000002,
"calib/mu_c": 0.9175333333333336,
"calib/mu_w": 0.715,
"calib/nonempty_final_conf_rate": 0.765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11295918367346952,
"calib/std_conf": 0.19551345274451482,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.234375,
"completions/max_length": 2925.0,
"completions/max_terminated_length": 2925.0,
"completions/mean_length": 810.42578125,
"completions/mean_terminated_length": 1058.5152587890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 522.0,
"epoch": 0.1408,
"grad_norm": 0.35102054476737976,
"learning_rate": 1.916666666666667e-06,
"loss": -0.2885,
"num_tokens": 36325463.0,
"reward": 1.1497254371643066,
"reward_std": 0.6093935966491699,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6461539268493652,
"rewards/format_reward_step": 0.765625,
"rewards/stepwise_brier_reward": 0.656593918800354,
"step": 132
},
{
"calib/answer_extract_rate": 0.72265625,
"calib/avg_num_step_conf": 7.2421875,
"calib/ece": 0.2776630434782611,
"calib/final_conf_rate": 0.71875,
"calib/format_rate": 0.71875,
"calib/frac_conf_gt_0.9": 0.5760869565217391,
"calib/gap": 0.10685245115665865,
"calib/mean_conf": 0.8336413043478261,
"calib/mu_c": 0.8806796116504858,
"calib/mu_w": 0.7738271604938272,
"calib/nonempty_final_conf_rate": 0.71875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.27576086956521767,
"calib/std_conf": 0.20506176596130307,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 2892.0,
"completions/max_terminated_length": 2892.0,
"completions/mean_length": 848.58203125,
"completions/mean_terminated_length": 1180.6358642578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 675.0,
"epoch": 0.14186666666666667,
"grad_norm": 0.3158020079135895,
"learning_rate": 1.888888888888889e-06,
"loss": -0.3075,
"num_tokens": 36672084.0,
"reward": 0.8508057594299316,
"reward_std": 0.6803976893424988,
"rewards/accuracy_reward_step": 0.40234375,
"rewards/final_brier_reward_step": 0.4953535199165344,
"rewards/format_reward_step": 0.71875,
"rewards/stepwise_brier_reward": 0.5156407952308655,
"step": 133
},
{
"calib/answer_extract_rate": 0.67578125,
"calib/avg_num_step_conf": 7.18359375,
"calib/ece": 0.20092485549132955,
"calib/final_conf_rate": 0.67578125,
"calib/format_rate": 0.67578125,
"calib/frac_conf_gt_0.9": 0.5028901734104047,
"calib/gap": 0.20161716171617194,
"calib/mean_conf": 0.7610404624277458,
"calib/mu_c": 0.8449504950495053,
"calib/mu_w": 0.6433333333333333,
"calib/nonempty_final_conf_rate": 0.67578125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.18907514450867058,
"calib/std_conf": 0.27433801264790336,
"calib/step_conf_rate": 0.99609375,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.32421875,
"completions/max_length": 2559.0,
"completions/max_terminated_length": 2559.0,
"completions/mean_length": 866.5546875,
"completions/mean_terminated_length": 1282.300537109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 624.0,
"epoch": 0.14293333333333333,
"grad_norm": 0.3706478476524353,
"learning_rate": 1.8611111111111113e-06,
"loss": -0.4715,
"num_tokens": 37025914.0,
"reward": 0.8468378782272339,
"reward_std": 0.6407945156097412,
"rewards/accuracy_reward_step": 0.39453125,
"rewards/final_brier_reward_step": 0.5057078003883362,
"rewards/format_reward_step": 0.67578125,
"rewards/stepwise_brier_reward": 0.5274984836578369,
"step": 134
},
{
"calib/answer_extract_rate": 0.73828125,
"calib/avg_num_step_conf": 8.42578125,
"calib/ece": 0.1592592592592594,
"calib/final_conf_rate": 0.73828125,
"calib/format_rate": 0.73828125,
"calib/frac_conf_gt_0.9": 0.6878306878306878,
"calib/gap": 0.12027653003930394,
"calib/mean_conf": 0.8639153439153441,
"calib/mu_c": 0.897007299270073,
"calib/mu_w": 0.7767307692307691,
"calib/nonempty_final_conf_rate": 0.73828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1491534391534393,
"calib/std_conf": 0.19491013592760018,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.26171875,
"completions/max_length": 2817.0,
"completions/max_terminated_length": 2817.0,
"completions/mean_length": 903.921875,
"completions/mean_terminated_length": 1224.3597412109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 721.0,
"epoch": 0.144,
"grad_norm": 0.31145983934402466,
"learning_rate": 1.8333333333333333e-06,
"loss": -0.3567,
"num_tokens": 37386238.0,
"reward": 1.0521256923675537,
"reward_std": 0.5641303062438965,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.5841398239135742,
"rewards/format_reward_step": 0.73828125,
"rewards/stepwise_brier_reward": 0.6042855978012085,
"step": 135
},
{
"calib/answer_extract_rate": 0.65234375,
"calib/avg_num_step_conf": 7.8671875,
"calib/ece": 0.13646706586826352,
"calib/final_conf_rate": 0.65234375,
"calib/format_rate": 0.65234375,
"calib/frac_conf_gt_0.9": 0.5209580838323353,
"calib/gap": 0.29955586080586094,
"calib/mean_conf": 0.7503592814371257,
"calib/mu_c": 0.8633653846153848,
"calib/mu_w": 0.5638095238095239,
"calib/nonempty_final_conf_rate": 0.65234375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1320359281437126,
"calib/std_conf": 0.2748628230875383,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 3006.0,
"completions/max_terminated_length": 3006.0,
"completions/mean_length": 913.390625,
"completions/mean_terminated_length": 1391.8333740234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 785.0,
"epoch": 0.14506666666666668,
"grad_norm": 0.45474621653556824,
"learning_rate": 1.8055555555555557e-06,
"loss": -0.4577,
"num_tokens": 37751594.0,
"reward": 0.8631396293640137,
"reward_std": 0.5096396207809448,
"rewards/accuracy_reward_step": 0.40625,
"rewards/final_brier_reward_step": 0.5309988260269165,
"rewards/format_reward_step": 0.65234375,
"rewards/stepwise_brier_reward": 0.5046233534812927,
"step": 136
},
{
"calib/answer_extract_rate": 0.65625,
"calib/avg_num_step_conf": 8.36328125,
"calib/ece": 0.15392857142857158,
"calib/final_conf_rate": 0.65625,
"calib/format_rate": 0.65625,
"calib/frac_conf_gt_0.9": 0.5476190476190477,
"calib/gap": 0.16138763197586747,
"calib/mean_conf": 0.8069047619047619,
"calib/mu_c": 0.8558974358974362,
"calib/mu_w": 0.6945098039215687,
"calib/nonempty_final_conf_rate": 0.65625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1322023809523811,
"calib/std_conf": 0.23754686851067733,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 2553.0,
"completions/max_terminated_length": 2553.0,
"completions/mean_length": 942.3671875,
"completions/mean_terminated_length": 1435.9881591796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 798.0,
"epoch": 0.14613333333333334,
"grad_norm": 0.35132110118865967,
"learning_rate": 1.777777777777778e-06,
"loss": -0.4206,
"num_tokens": 38122864.0,
"reward": 0.910294234752655,
"reward_std": 0.5566987991333008,
"rewards/accuracy_reward_step": 0.45703125,
"rewards/final_brier_reward_step": 0.5172500014305115,
"rewards/format_reward_step": 0.65625,
"rewards/stepwise_brier_reward": 0.5160518884658813,
"step": 137
},
{
"calib/answer_extract_rate": 0.734375,
"calib/avg_num_step_conf": 9.47265625,
"calib/ece": 0.19737967914438515,
"calib/final_conf_rate": 0.73046875,
"calib/format_rate": 0.7265625,
"calib/frac_conf_gt_0.9": 0.6737967914438503,
"calib/gap": 0.1671767741935487,
"calib/mean_conf": 0.8446524064171124,
"calib/mu_c": 0.9000800000000002,
"calib/mu_w": 0.7329032258064515,
"calib/nonempty_final_conf_rate": 0.73046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18679144385026747,
"calib/std_conf": 0.22550540274089256,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.265625,
"completions/max_length": 2915.0,
"completions/max_terminated_length": 2915.0,
"completions/mean_length": 1003.65234375,
"completions/mean_terminated_length": 1366.675537109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 584.0,
"epoch": 0.1472,
"grad_norm": 0.48376113176345825,
"learning_rate": 1.75e-06,
"loss": -0.3257,
"num_tokens": 38507175.0,
"reward": 0.9776197671890259,
"reward_std": 0.5277310609817505,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.5589753985404968,
"rewards/format_reward_step": 0.7265625,
"rewards/stepwise_brier_reward": 0.5487784743309021,
"step": 138
},
{
"calib/answer_extract_rate": 0.81640625,
"calib/avg_num_step_conf": 10.5390625,
"calib/ece": 0.2093779904306221,
"calib/final_conf_rate": 0.81640625,
"calib/format_rate": 0.81640625,
"calib/frac_conf_gt_0.9": 0.7177033492822966,
"calib/gap": 0.11303598858898434,
"calib/mean_conf": 0.8696650717703351,
"calib/mu_c": 0.9031972789115649,
"calib/mu_w": 0.7901612903225805,
"calib/nonempty_final_conf_rate": 0.81640625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1878468899521532,
"calib/std_conf": 0.2074667962714477,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.18359375,
"completions/max_length": 2460.0,
"completions/max_terminated_length": 2460.0,
"completions/mean_length": 1105.13671875,
"completions/mean_terminated_length": 1353.6602783203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 751.0,
"epoch": 0.14826666666666666,
"grad_norm": 0.26218655705451965,
"learning_rate": 1.7222222222222224e-06,
"loss": -0.2185,
"num_tokens": 38916226.0,
"reward": 1.1242291927337646,
"reward_std": 0.49970123171806335,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.6268507838249207,
"rewards/format_reward_step": 0.81640625,
"rewards/stepwise_brier_reward": 0.6197774410247803,
"step": 139
},
{
"calib/answer_extract_rate": 0.80078125,
"calib/avg_num_step_conf": 10.57421875,
"calib/ece": 0.17700980392156873,
"calib/final_conf_rate": 0.796875,
"calib/format_rate": 0.796875,
"calib/frac_conf_gt_0.9": 0.6470588235294118,
"calib/gap": 0.15736183278223914,
"calib/mean_conf": 0.8227941176470589,
"calib/mu_c": 0.8675342465753425,
"calib/mu_w": 0.7101724137931034,
"calib/nonempty_final_conf_rate": 0.796875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14205882352941188,
"calib/std_conf": 0.2532756708718664,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.203125,
"completions/max_length": 2853.0,
"completions/max_terminated_length": 2853.0,
"completions/mean_length": 1113.84375,
"completions/mean_terminated_length": 1397.7647705078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 733.0,
"epoch": 0.14933333333333335,
"grad_norm": 0.2385905385017395,
"learning_rate": 1.6944444444444446e-06,
"loss": -0.3415,
"num_tokens": 39329426.0,
"reward": 1.120464563369751,
"reward_std": 0.5710935592651367,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.6254988312721252,
"rewards/format_reward_step": 0.796875,
"rewards/stepwise_brier_reward": 0.6308607459068298,
"step": 140
},
{
"calib/answer_extract_rate": 0.765625,
"calib/avg_num_step_conf": 9.63671875,
"calib/ece": 0.09076530612244921,
"calib/final_conf_rate": 0.765625,
"calib/format_rate": 0.765625,
"calib/frac_conf_gt_0.9": 0.6224489795918368,
"calib/gap": 0.277523923444976,
"calib/mean_conf": 0.7995408163265308,
"calib/mu_c": 0.8618421052631579,
"calib/mu_w": 0.5843181818181818,
"calib/nonempty_final_conf_rate": 0.765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05739795918367369,
"calib/std_conf": 0.2624225704410439,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.234375,
"completions/max_length": 2658.0,
"completions/max_terminated_length": 2658.0,
"completions/mean_length": 1123.88671875,
"completions/mean_terminated_length": 1467.93359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 812.0,
"epoch": 0.1504,
"grad_norm": 0.3722957372665405,
"learning_rate": 1.6666666666666667e-06,
"loss": -0.3131,
"num_tokens": 39747277.0,
"reward": 1.155234694480896,
"reward_std": 0.5611724853515625,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6531496047973633,
"rewards/format_reward_step": 0.765625,
"rewards/stepwise_brier_reward": 0.6333895921707153,
"step": 141
},
{
"calib/answer_extract_rate": 0.81640625,
"calib/avg_num_step_conf": 10.171875,
"calib/ece": 0.15181818181818202,
"calib/final_conf_rate": 0.81640625,
"calib/format_rate": 0.81640625,
"calib/frac_conf_gt_0.9": 0.5741626794258373,
"calib/gap": 0.23373545621555436,
"calib/mean_conf": 0.7832057416267943,
"calib/mu_c": 0.8626086956521739,
"calib/mu_w": 0.6288732394366195,
"calib/nonempty_final_conf_rate": 0.81640625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13736842105263175,
"calib/std_conf": 0.26833486321890476,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1796875,
"completions/max_length": 2798.0,
"completions/max_terminated_length": 2798.0,
"completions/mean_length": 1136.8828125,
"completions/mean_terminated_length": 1385.914306640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 826.0,
"epoch": 0.15146666666666667,
"grad_norm": 0.32148581743240356,
"learning_rate": 1.638888888888889e-06,
"loss": -0.2852,
"num_tokens": 40166519.0,
"reward": 1.1054890155792236,
"reward_std": 0.48387736082077026,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.6477667689323425,
"rewards/format_reward_step": 0.81640625,
"rewards/stepwise_brier_reward": 0.6436101198196411,
"step": 142
},
{
"calib/answer_extract_rate": 0.81640625,
"calib/avg_num_step_conf": 10.2578125,
"calib/ece": 0.1689557416267945,
"calib/final_conf_rate": 0.81640625,
"calib/format_rate": 0.81640625,
"calib/frac_conf_gt_0.9": 0.5598086124401914,
"calib/gap": 0.2877036915793715,
"calib/mean_conf": 0.7551590909090911,
"calib/mu_c": 0.8556488970588235,
"calib/mu_w": 0.567945205479452,
"calib/nonempty_final_conf_rate": 0.81640625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13669856459330168,
"calib/std_conf": 0.3021770587579477,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.18359375,
"completions/max_length": 2989.0,
"completions/max_terminated_length": 2989.0,
"completions/mean_length": 1127.78125,
"completions/mean_terminated_length": 1381.3970947265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 857.0,
"epoch": 0.15253333333333333,
"grad_norm": 0.2856874465942383,
"learning_rate": 1.6111111111111113e-06,
"loss": -0.2282,
"num_tokens": 40585607.0,
"reward": 1.1005263328552246,
"reward_std": 0.429995059967041,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.6541682481765747,
"rewards/format_reward_step": 0.81640625,
"rewards/stepwise_brier_reward": 0.6422065496444702,
"step": 143
},
{
"calib/answer_extract_rate": 0.80078125,
"calib/avg_num_step_conf": 10.16015625,
"calib/ece": 0.12360975609756103,
"calib/final_conf_rate": 0.80078125,
"calib/format_rate": 0.80078125,
"calib/frac_conf_gt_0.9": 0.6146341463414634,
"calib/gap": 0.24458620689655175,
"calib/mean_conf": 0.792,
"calib/mu_c": 0.8635862068965517,
"calib/mu_w": 0.619,
"calib/nonempty_final_conf_rate": 0.80078125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10414634146341469,
"calib/std_conf": 0.27203550700813073,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.19921875,
"completions/max_length": 2964.0,
"completions/max_terminated_length": 2964.0,
"completions/mean_length": 1163.8046875,
"completions/mean_terminated_length": 1453.3365478515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 778.0,
"epoch": 0.1536,
"grad_norm": 0.7753714323043823,
"learning_rate": 1.5833333333333333e-06,
"loss": -0.3242,
"num_tokens": 41010709.0,
"reward": 1.1325209140777588,
"reward_std": 0.5945384502410889,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6510945558547974,
"rewards/format_reward_step": 0.80078125,
"rewards/stepwise_brier_reward": 0.6419569253921509,
"step": 144
},
{
"calib/answer_extract_rate": 0.83984375,
"calib/avg_num_step_conf": 10.0625,
"calib/ece": 0.16651162790697696,
"calib/final_conf_rate": 0.83984375,
"calib/format_rate": 0.83984375,
"calib/frac_conf_gt_0.9": 0.5627906976744186,
"calib/gap": 0.16140656262505015,
"calib/mean_conf": 0.7806511627906978,
"calib/mu_c": 0.8317006802721089,
"calib/mu_w": 0.6702941176470587,
"calib/nonempty_final_conf_rate": 0.83984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13172093023255835,
"calib/std_conf": 0.2668917456460517,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.16015625,
"completions/max_length": 2867.0,
"completions/max_terminated_length": 2867.0,
"completions/mean_length": 1172.73828125,
"completions/mean_terminated_length": 1396.376708984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 747.0,
"epoch": 0.15466666666666667,
"grad_norm": 0.3314751386642456,
"learning_rate": 1.5555555555555558e-06,
"loss": -0.1993,
"num_tokens": 41436674.0,
"reward": 1.1464898586273193,
"reward_std": 0.45233646035194397,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.6491436958312988,
"rewards/format_reward_step": 0.83984375,
"rewards/stepwise_brier_reward": 0.6548593640327454,
"step": 145
},
{
"calib/answer_extract_rate": 0.87109375,
"calib/avg_num_step_conf": 9.67578125,
"calib/ece": 0.21000000000000008,
"calib/final_conf_rate": 0.87109375,
"calib/format_rate": 0.87109375,
"calib/frac_conf_gt_0.9": 0.484304932735426,
"calib/gap": 0.1960435835351091,
"calib/mean_conf": 0.7188789237668164,
"calib/mu_c": 0.8111864406779663,
"calib/mu_w": 0.6151428571428572,
"calib/nonempty_final_conf_rate": 0.87109375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.199865470852018,
"calib/std_conf": 0.30422009181563364,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.12890625,
"completions/max_length": 2949.0,
"completions/max_terminated_length": 2949.0,
"completions/mean_length": 1151.5625,
"completions/mean_terminated_length": 1321.97314453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 799.0,
"epoch": 0.15573333333333333,
"grad_norm": 0.295030802488327,
"learning_rate": 1.527777777777778e-06,
"loss": -0.1696,
"num_tokens": 41861730.0,
"reward": 1.019819974899292,
"reward_std": 0.4136025905609131,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.6271792650222778,
"rewards/format_reward_step": 0.87109375,
"rewards/stepwise_brier_reward": 0.632733941078186,
"step": 146
},
{
"calib/answer_extract_rate": 0.90234375,
"calib/avg_num_step_conf": 10.02734375,
"calib/ece": 0.2022510822510824,
"calib/final_conf_rate": 0.90234375,
"calib/format_rate": 0.90234375,
"calib/frac_conf_gt_0.9": 0.5497835497835498,
"calib/gap": 0.18307345861158586,
"calib/mean_conf": 0.7513419913419914,
"calib/mu_c": 0.8258394160583943,
"calib/mu_w": 0.6427659574468084,
"calib/nonempty_final_conf_rate": 0.90234375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1802597402597404,
"calib/std_conf": 0.2916076055181595,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09765625,
"completions/max_length": 2747.0,
"completions/max_terminated_length": 2747.0,
"completions/mean_length": 1135.1953125,
"completions/mean_terminated_length": 1258.052001953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 756.0,
"epoch": 0.1568,
"grad_norm": 0.2905469238758087,
"learning_rate": 1.5e-06,
"loss": -0.1829,
"num_tokens": 42279060.0,
"reward": 1.1319208145141602,
"reward_std": 0.4488699436187744,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6649765372276306,
"rewards/format_reward_step": 0.90234375,
"rewards/stepwise_brier_reward": 0.6961677074432373,
"step": 147
},
{
"calib/answer_extract_rate": 0.921875,
"calib/avg_num_step_conf": 9.98046875,
"calib/ece": 0.1297033898305086,
"calib/final_conf_rate": 0.921875,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.5932203389830508,
"calib/gap": 0.2493200309957383,
"calib/mean_conf": 0.7858050847457629,
"calib/mu_c": 0.8470786516853934,
"calib/mu_w": 0.5977586206896551,
"calib/nonempty_final_conf_rate": 0.921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0806355932203391,
"calib/std_conf": 0.27004979950267227,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2720.0,
"completions/max_terminated_length": 2720.0,
"completions/mean_length": 1095.41015625,
"completions/mean_terminated_length": 1188.2415771484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 546.0,
"epoch": 0.15786666666666666,
"grad_norm": 0.3573284447193146,
"learning_rate": 1.4722222222222225e-06,
"loss": -0.134,
"num_tokens": 42687637.0,
"reward": 1.3620367050170898,
"reward_std": 0.4463733434677124,
"rewards/accuracy_reward_step": 0.6953125,
"rewards/final_brier_reward_step": 0.7680534720420837,
"rewards/format_reward_step": 0.921875,
"rewards/stepwise_brier_reward": 0.7620397806167603,
"step": 148
},
{
"calib/answer_extract_rate": 0.85546875,
"calib/avg_num_step_conf": 9.08203125,
"calib/ece": 0.14388127853881297,
"calib/final_conf_rate": 0.85546875,
"calib/format_rate": 0.85546875,
"calib/frac_conf_gt_0.9": 0.6301369863013698,
"calib/gap": 0.29610000000000014,
"calib/mean_conf": 0.7777625570776258,
"calib/mu_c": 0.8791666666666669,
"calib/mu_w": 0.5830666666666667,
"calib/nonempty_final_conf_rate": 0.85546875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1320547945205481,
"calib/std_conf": 0.2957337724521902,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.14453125,
"completions/max_length": 3041.0,
"completions/max_terminated_length": 3041.0,
"completions/mean_length": 1135.609375,
"completions/mean_terminated_length": 1327.47021484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 682.0,
"epoch": 0.15893333333333334,
"grad_norm": 0.3924039900302887,
"learning_rate": 1.4444444444444445e-06,
"loss": -0.1958,
"num_tokens": 43105849.0,
"reward": 1.1683149337768555,
"reward_std": 0.46608710289001465,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.6897277235984802,
"rewards/format_reward_step": 0.85546875,
"rewards/stepwise_brier_reward": 0.7016169428825378,
"step": 149
},
{
"calib/answer_extract_rate": 0.91015625,
"calib/avg_num_step_conf": 9.7421875,
"calib/ece": 0.16900862068965544,
"calib/final_conf_rate": 0.90625,
"calib/format_rate": 0.90625,
"calib/frac_conf_gt_0.9": 0.7586206896551724,
"calib/gap": 0.22897058823529437,
"calib/mean_conf": 0.8653879310344827,
"calib/mu_c": 0.9325000000000002,
"calib/mu_w": 0.7035294117647058,
"calib/nonempty_final_conf_rate": 0.90625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16375000000000028,
"calib/std_conf": 0.2334354105289573,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08984375,
"completions/max_length": 2899.0,
"completions/max_terminated_length": 2899.0,
"completions/mean_length": 1121.7265625,
"completions/mean_terminated_length": 1232.4549560546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 745.0,
"epoch": 0.16,
"grad_norm": 0.4510607123374939,
"learning_rate": 1.4166666666666667e-06,
"loss": -0.1138,
"num_tokens": 43521011.0,
"reward": 1.2876085042953491,
"reward_std": 0.33042508363723755,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7323199510574341,
"rewards/format_reward_step": 0.90625,
"rewards/stepwise_brier_reward": 0.7451692819595337,
"step": 150
},
{
"calib/answer_extract_rate": 0.9140625,
"calib/avg_num_step_conf": 9.40625,
"calib/ece": 0.179871794871795,
"calib/final_conf_rate": 0.9140625,
"calib/format_rate": 0.9140625,
"calib/frac_conf_gt_0.9": 0.5555555555555556,
"calib/gap": 0.21956062461726877,
"calib/mean_conf": 0.7550854700854701,
"calib/mu_c": 0.8414084507042254,
"calib/mu_w": 0.6218478260869567,
"calib/nonempty_final_conf_rate": 0.9140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1640598290598292,
"calib/std_conf": 0.28199266743966284,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0859375,
"completions/max_length": 2663.0,
"completions/max_terminated_length": 2663.0,
"completions/mean_length": 1147.74609375,
"completions/mean_terminated_length": 1255.6539306640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 702.0,
"epoch": 0.16106666666666666,
"grad_norm": 0.3453322947025299,
"learning_rate": 1.3888888888888892e-06,
"loss": -0.1646,
"num_tokens": 43944898.0,
"reward": 1.1735072135925293,
"reward_std": 0.3468888998031616,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.6989699006080627,
"rewards/format_reward_step": 0.9140625,
"rewards/stepwise_brier_reward": 0.7117139101028442,
"step": 151
},
{
"calib/answer_extract_rate": 0.85546875,
"calib/avg_num_step_conf": 8.62890625,
"calib/ece": 0.17242009132420114,
"calib/final_conf_rate": 0.85546875,
"calib/format_rate": 0.85546875,
"calib/frac_conf_gt_0.9": 0.5342465753424658,
"calib/gap": 0.27221547536433044,
"calib/mean_conf": 0.7330593607305937,
"calib/mu_c": 0.8424427480916031,
"calib/mu_w": 0.5702272727272727,
"calib/nonempty_final_conf_rate": 0.85546875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1536529680365299,
"calib/std_conf": 0.2954947111712576,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.140625,
"completions/max_length": 2931.0,
"completions/max_terminated_length": 2931.0,
"completions/mean_length": 1103.75,
"completions/mean_terminated_length": 1284.3636474609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 759.0,
"epoch": 0.16213333333333332,
"grad_norm": 0.3742715120315552,
"learning_rate": 1.3611111111111112e-06,
"loss": -0.1866,
"num_tokens": 44355890.0,
"reward": 1.101632833480835,
"reward_std": 0.4346971809864044,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.6715320348739624,
"rewards/format_reward_step": 0.85546875,
"rewards/stepwise_brier_reward": 0.6744046807289124,
"step": 152
},
{
"calib/answer_extract_rate": 0.91015625,
"calib/avg_num_step_conf": 9.0625,
"calib/ece": 0.20343347639485,
"calib/final_conf_rate": 0.91015625,
"calib/format_rate": 0.91015625,
"calib/frac_conf_gt_0.9": 0.630901287553648,
"calib/gap": 0.16771658721025817,
"calib/mean_conf": 0.7779399141630904,
"calib/mu_c": 0.834805194805195,
"calib/mu_w": 0.6670886075949368,
"calib/nonempty_final_conf_rate": 0.91015625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16021459227467832,
"calib/std_conf": 0.2906379778314124,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0859375,
"completions/max_length": 2964.0,
"completions/max_terminated_length": 2964.0,
"completions/mean_length": 1091.0546875,
"completions/mean_terminated_length": 1193.632568359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 533.0,
"epoch": 0.1632,
"grad_norm": 0.3894638419151306,
"learning_rate": 1.3333333333333334e-06,
"loss": -0.1133,
"num_tokens": 44765560.0,
"reward": 1.2110968828201294,
"reward_std": 0.3976651132106781,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6852694749832153,
"rewards/format_reward_step": 0.91015625,
"rewards/stepwise_brier_reward": 0.703535795211792,
"step": 153
},
{
"calib/answer_extract_rate": 0.92578125,
"calib/avg_num_step_conf": 8.95703125,
"calib/ece": 0.24582278481012673,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.6455696202531646,
"calib/gap": 0.1993506493506494,
"calib/mean_conf": 0.798649789029536,
"calib/mu_c": 0.8869696969696971,
"calib/mu_w": 0.6876190476190477,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.24375527426160354,
"calib/std_conf": 0.2748146128667527,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 2835.0,
"completions/max_terminated_length": 2835.0,
"completions/mean_length": 1094.3203125,
"completions/mean_terminated_length": 1182.050537109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 706.0,
"epoch": 0.16426666666666667,
"grad_norm": 0.3308521509170532,
"learning_rate": 1.3055555555555556e-06,
"loss": -0.0203,
"num_tokens": 45173186.0,
"reward": 1.1115479469299316,
"reward_std": 0.33493223786354065,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.6644241809844971,
"rewards/format_reward_step": 0.92578125,
"rewards/stepwise_brier_reward": 0.6845309734344482,
"step": 154
},
{
"calib/answer_extract_rate": 0.9375,
"calib/avg_num_step_conf": 8.796875,
"calib/ece": 0.22666666666666685,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.625,
"calib/gap": 0.14535439982444576,
"calib/mean_conf": 0.7974166666666669,
"calib/mu_c": 0.8537414965986394,
"calib/mu_w": 0.7083870967741936,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20579166666666684,
"calib/std_conf": 0.26533625155430396,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2286.0,
"completions/max_terminated_length": 2286.0,
"completions/mean_length": 1043.34375,
"completions/mean_terminated_length": 1112.9000244140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 592.0,
"epoch": 0.16533333333333333,
"grad_norm": 0.3368653953075409,
"learning_rate": 1.2777777777777779e-06,
"loss": -0.0878,
"num_tokens": 45570538.0,
"reward": 1.1853913068771362,
"reward_std": 0.4266975522041321,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.6816155910491943,
"rewards/format_reward_step": 0.9375,
"rewards/stepwise_brier_reward": 0.7064590454101562,
"step": 155
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/avg_num_step_conf": 8.80859375,
"calib/ece": 0.254857142857143,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.6693877551020408,
"calib/gap": 0.13213520749665342,
"calib/mean_conf": 0.7913469387755104,
"calib/mu_c": 0.8361111111111112,
"calib/mu_w": 0.7039759036144578,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19248979591836748,
"calib/std_conf": 0.3032884766054413,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2529.0,
"completions/max_terminated_length": 2529.0,
"completions/mean_length": 1077.88671875,
"completions/mean_terminated_length": 1126.2816162109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 621.0,
"epoch": 0.1664,
"grad_norm": 0.32788556814193726,
"learning_rate": 1.25e-06,
"loss": -0.0551,
"num_tokens": 45974277.0,
"reward": 1.264580249786377,
"reward_std": 0.3429688513278961,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.6950687170028687,
"rewards/format_reward_step": 0.95703125,
"rewards/stepwise_brier_reward": 0.7541208863258362,
"step": 156
},
{
"calib/answer_extract_rate": 0.91796875,
"calib/avg_num_step_conf": 8.62890625,
"calib/ece": 0.13868085106383005,
"calib/final_conf_rate": 0.91796875,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.7404255319148936,
"calib/gap": 0.23696136408436852,
"calib/mean_conf": 0.8524680851063832,
"calib/mu_c": 0.9099438202247194,
"calib/mu_w": 0.6729824561403509,
"calib/nonempty_final_conf_rate": 0.91796875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11685106382978751,
"calib/std_conf": 0.24720366752287212,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08203125,
"completions/max_length": 2307.0,
"completions/max_terminated_length": 2307.0,
"completions/mean_length": 979.1015625,
"completions/mean_terminated_length": 1066.595703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 497.0,
"epoch": 0.16746666666666668,
"grad_norm": 0.3304344415664673,
"learning_rate": 1.2222222222222223e-06,
"loss": -0.0712,
"num_tokens": 46351695.0,
"reward": 1.3637332916259766,
"reward_std": 0.33922278881073,
"rewards/accuracy_reward_step": 0.6953125,
"rewards/final_brier_reward_step": 0.7648605108261108,
"rewards/format_reward_step": 0.91796875,
"rewards/stepwise_brier_reward": 0.7767749428749084,
"step": 157
},
{
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 8.87109375,
"calib/ece": 0.21689516129032282,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.8024193548387096,
"calib/gap": 0.11029681762545929,
"calib/mean_conf": 0.897943548387097,
"calib/mu_c": 0.9317441860465118,
"calib/mu_w": 0.8214473684210525,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21064516129032285,
"calib/std_conf": 0.18728375928156385,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2339.0,
"completions/max_terminated_length": 2339.0,
"completions/mean_length": 987.1328125,
"completions/mean_terminated_length": 1018.9757690429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 651.0,
"epoch": 0.16853333333333334,
"grad_norm": 0.600447952747345,
"learning_rate": 1.1944444444444446e-06,
"loss": -0.0271,
"num_tokens": 46732681.0,
"reward": 1.3191943168640137,
"reward_std": 0.39712393283843994,
"rewards/accuracy_reward_step": 0.671875,
"rewards/final_brier_reward_step": 0.7299152612686157,
"rewards/format_reward_step": 0.96484375,
"rewards/stepwise_brier_reward": 0.7435094118118286,
"step": 158
},
{
"calib/answer_extract_rate": 0.93359375,
"calib/avg_num_step_conf": 8.609375,
"calib/ece": 0.26418410041841023,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.7907949790794979,
"calib/gap": 0.12414196686426482,
"calib/mean_conf": 0.8959832635983266,
"calib/mu_c": 0.9406535947712419,
"calib/mu_w": 0.8165116279069771,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2600000000000002,
"calib/std_conf": 0.18460796336643473,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 2154.0,
"completions/max_terminated_length": 2154.0,
"completions/mean_length": 936.7421875,
"completions/mean_terminated_length": 1003.3723754882812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 632.0,
"epoch": 0.1696,
"grad_norm": 0.3773444592952728,
"learning_rate": 1.1666666666666668e-06,
"loss": -0.0523,
"num_tokens": 47100311.0,
"reward": 1.2171008586883545,
"reward_std": 0.3303322494029999,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6760132908821106,
"rewards/format_reward_step": 0.9296875,
"rewards/stepwise_brier_reward": 0.7538768649101257,
"step": 159
},
{
"calib/answer_extract_rate": 0.9453125,
"calib/avg_num_step_conf": 8.66796875,
"calib/ece": 0.2105371900826447,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.8016528925619835,
"calib/gap": 0.08989621942179415,
"calib/mean_conf": 0.9030991735537193,
"calib/mu_c": 0.9294736842105265,
"calib/mu_w": 0.8395774647887323,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20351239669421495,
"calib/std_conf": 0.17958141895077795,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 2108.0,
"completions/max_terminated_length": 2108.0,
"completions/mean_length": 963.9296875,
"completions/mean_terminated_length": 1019.6941528320312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 599.0,
"epoch": 0.17066666666666666,
"grad_norm": 0.3580320477485657,
"learning_rate": 1.138888888888889e-06,
"loss": -0.0707,
"num_tokens": 47474957.0,
"reward": 1.3058538436889648,
"reward_std": 0.4367513656616211,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.7175910472869873,
"rewards/format_reward_step": 0.9453125,
"rewards/stepwise_brier_reward": 0.7382331490516663,
"step": 160
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/avg_num_step_conf": 8.90625,
"calib/ece": 0.15040485829959527,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.8461538461538461,
"calib/gap": 0.12649431818181822,
"calib/mean_conf": 0.9221457489878544,
"calib/mu_c": 0.9503125,
"calib/mu_w": 0.8238181818181818,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1476113360323888,
"calib/std_conf": 0.1610663196451516,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2594.0,
"completions/max_terminated_length": 2594.0,
"completions/mean_length": 976.9375,
"completions/mean_terminated_length": 1012.534423828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 644.0,
"epoch": 0.17173333333333332,
"grad_norm": 0.34767386317253113,
"learning_rate": 1.111111111111111e-06,
"loss": -0.0098,
"num_tokens": 47852013.0,
"reward": 1.4458568096160889,
"reward_std": 0.3386097550392151,
"rewards/accuracy_reward_step": 0.75,
"rewards/final_brier_reward_step": 0.7948245406150818,
"rewards/format_reward_step": 0.9609375,
"rewards/stepwise_brier_reward": 0.8094027638435364,
"step": 161
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 9.2734375,
"calib/ece": 0.21533864541832684,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.8844621513944223,
"calib/gap": 0.11473164516642786,
"calib/mean_conf": 0.9404382470119522,
"calib/mu_c": 0.9719780219780222,
"calib/mu_w": 0.8572463768115943,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21533864541832684,
"calib/std_conf": 0.1442255288036068,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 1632.0,
"completions/max_terminated_length": 1632.0,
"completions/mean_length": 918.28125,
"completions/mean_terminated_length": 936.57373046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 411.0,
"epoch": 0.1728,
"grad_norm": 0.3539327383041382,
"learning_rate": 1.0833333333333335e-06,
"loss": -0.0135,
"num_tokens": 48214277.0,
"reward": 1.3857910633087158,
"reward_std": 0.2928190231323242,
"rewards/accuracy_reward_step": 0.7109375,
"rewards/final_brier_reward_step": 0.7601113319396973,
"rewards/format_reward_step": 0.9765625,
"rewards/stepwise_brier_reward": 0.788566529750824,
"step": 162
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/avg_num_step_conf": 8.9921875,
"calib/ece": 0.2649795918367348,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.7387755102040816,
"calib/gap": 0.13937535370684762,
"calib/mean_conf": 0.879265306122449,
"calib/mu_c": 0.932171052631579,
"calib/mu_w": 0.7927956989247313,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2619183673469389,
"calib/std_conf": 0.1996043817266999,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2537.0,
"completions/max_terminated_length": 2537.0,
"completions/mean_length": 955.48046875,
"completions/mean_terminated_length": 998.3795776367188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 572.0,
"epoch": 0.17386666666666667,
"grad_norm": 0.38675457239151,
"learning_rate": 1.0555555555555557e-06,
"loss": -0.0548,
"num_tokens": 48586752.0,
"reward": 1.2166765928268433,
"reward_std": 0.3924652338027954,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6922163963317871,
"rewards/format_reward_step": 0.95703125,
"rewards/stepwise_brier_reward": 0.7244610786437988,
"step": 163
},
{
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 9.140625,
"calib/ece": 0.25266129032258083,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.7983870967741935,
"calib/gap": 0.10638242894056837,
"calib/mean_conf": 0.9058870967741937,
"calib/mu_c": 0.9427777777777779,
"calib/mu_w": 0.8363953488372096,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.25266129032258083,
"calib/std_conf": 0.17819214030300223,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2852.0,
"completions/max_terminated_length": 2852.0,
"completions/mean_length": 979.5625,
"completions/mean_terminated_length": 1011.1612548828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 532.0,
"epoch": 0.17493333333333333,
"grad_norm": 0.36825188994407654,
"learning_rate": 1.0277777777777777e-06,
"loss": -0.0458,
"num_tokens": 48966696.0,
"reward": 1.2688332796096802,
"reward_std": 0.43453603982925415,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.7033937573432922,
"rewards/format_reward_step": 0.96875,
"rewards/stepwise_brier_reward": 0.7497953176498413,
"step": 164
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/avg_num_step_conf": 9.37890625,
"calib/ece": 0.32975609756097557,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.8252032520325203,
"calib/gap": 0.1507761437908497,
"calib/mean_conf": 0.9151219512195123,
"calib/mu_c": 0.9776388888888891,
"calib/mu_w": 0.8268627450980394,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.32975609756097557,
"calib/std_conf": 0.17408700681242176,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2404.0,
"completions/max_terminated_length": 2404.0,
"completions/mean_length": 932.90625,
"completions/mean_terminated_length": 970.8292236328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 613.0,
"epoch": 0.176,
"grad_norm": 0.2954544723033905,
"learning_rate": 1.0000000000000002e-06,
"loss": -0.0724,
"num_tokens": 49334136.0,
"reward": 1.1690062284469604,
"reward_std": 0.2975730895996094,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.6644234657287598,
"rewards/format_reward_step": 0.9609375,
"rewards/stepwise_brier_reward": 0.7128032445907593,
"step": 165
},
{
"calib/answer_extract_rate": 0.953125,
"calib/avg_num_step_conf": 9.69921875,
"calib/ece": 0.14987704918032807,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.819672131147541,
"calib/gap": 0.25266253869969046,
"calib/mean_conf": 0.9162704918032789,
"calib/mu_c": 0.975294117647059,
"calib/mu_w": 0.7226315789473685,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14987704918032807,
"calib/std_conf": 0.16811766648691037,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2859.0,
"completions/max_terminated_length": 2859.0,
"completions/mean_length": 912.1796875,
"completions/mean_terminated_length": 953.1346435546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 573.0,
"epoch": 0.17706666666666668,
"grad_norm": 0.3518250286579132,
"learning_rate": 9.722222222222224e-07,
"loss": -0.0472,
"num_tokens": 49696878.0,
"reward": 1.4419491291046143,
"reward_std": 0.298769474029541,
"rewards/accuracy_reward_step": 0.73046875,
"rewards/final_brier_reward_step": 0.8203636407852173,
"rewards/format_reward_step": 0.953125,
"rewards/stepwise_brier_reward": 0.8239436745643616,
"step": 166
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 10.29296875,
"calib/ece": 0.2028740157480316,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.8858267716535433,
"calib/gap": 0.09488400488400495,
"calib/mean_conf": 0.9444488188976379,
"calib/mu_c": 0.9687301587301589,
"calib/mu_w": 0.8738461538461539,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20161417322834657,
"calib/std_conf": 0.14152255370528902,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2602.0,
"completions/max_terminated_length": 2602.0,
"completions/mean_length": 893.87109375,
"completions/mean_terminated_length": 904.4703979492188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 535.0,
"epoch": 0.17813333333333334,
"grad_norm": 0.33679449558258057,
"learning_rate": 9.444444444444445e-07,
"loss": 0.0066,
"num_tokens": 50054357.0,
"reward": 1.4315710067749023,
"reward_std": 0.32214871048927307,
"rewards/accuracy_reward_step": 0.73828125,
"rewards/final_brier_reward_step": 0.7794097661972046,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.8174647092819214,
"step": 167
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 10.1015625,
"calib/ece": 0.23404000000000003,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.908,
"calib/gap": 0.1151349206349207,
"calib/mean_conf": 0.95404,
"calib/mu_c": 0.9862777777777778,
"calib/mu_w": 0.8711428571428571,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.23404000000000003,
"calib/std_conf": 0.12890336845870243,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2232.0,
"completions/max_terminated_length": 2232.0,
"completions/mean_length": 904.546875,
"completions/mean_terminated_length": 926.2560424804688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 557.0,
"epoch": 0.1792,
"grad_norm": 0.4689064919948578,
"learning_rate": 9.166666666666666e-07,
"loss": -0.0086,
"num_tokens": 50413633.0,
"reward": 1.373067855834961,
"reward_std": 0.4074147939682007,
"rewards/accuracy_reward_step": 0.703125,
"rewards/final_brier_reward_step": 0.7553042769432068,
"rewards/format_reward_step": 0.9765625,
"rewards/stepwise_brier_reward": 0.7785377502441406,
"step": 168
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 10.72265625,
"calib/ece": 0.29684000000000005,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.84,
"calib/gap": 0.1715272427077601,
"calib/mean_conf": 0.92884,
"calib/mu_c": 0.9919620253164558,
"calib/mu_w": 0.8204347826086957,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.29684000000000005,
"calib/std_conf": 0.17167834575158278,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 1703.0,
"completions/max_terminated_length": 1703.0,
"completions/mean_length": 860.4453125,
"completions/mean_terminated_length": 881.0960693359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 484.0,
"epoch": 0.18026666666666666,
"grad_norm": 0.3696523606777191,
"learning_rate": 8.88888888888889e-07,
"loss": -0.0199,
"num_tokens": 50761131.0,
"reward": 1.2635154724121094,
"reward_std": 0.29985445737838745,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.7125222682952881,
"rewards/format_reward_step": 0.9765625,
"rewards/stepwise_brier_reward": 0.7696424722671509,
"step": 169
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 10.53515625,
"calib/ece": 0.24538152610441774,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.8755020080321285,
"calib/gap": 0.07810114942528712,
"calib/mean_conf": 0.9441767068273094,
"calib/mu_c": 0.9677011494252873,
"calib/mu_w": 0.8896000000000002,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.24538152610441774,
"calib/std_conf": 0.14681431357425442,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 1884.0,
"completions/max_terminated_length": 1884.0,
"completions/mean_length": 890.81640625,
"completions/mean_terminated_length": 915.859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 608.0,
"epoch": 0.18133333333333335,
"grad_norm": 0.35689327120780945,
"learning_rate": 8.611111111111112e-07,
"loss": -0.0341,
"num_tokens": 51116372.0,
"reward": 1.332000970840454,
"reward_std": 0.4051094353199005,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/final_brier_reward_step": 0.7203788757324219,
"rewards/format_reward_step": 0.97265625,
"rewards/stepwise_brier_reward": 0.7794337868690491,
"step": 170
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/avg_num_step_conf": 10.61328125,
"calib/ece": 0.39426829268292696,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.8699186991869918,
"calib/gap": 0.08041441441441433,
"calib/mean_conf": 0.9430487804878049,
"calib/mu_c": 0.9793333333333334,
"calib/mu_w": 0.8989189189189191,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.39426829268292696,
"calib/std_conf": 0.15150751161493137,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2502.0,
"completions/max_terminated_length": 2502.0,
"completions/mean_length": 878.06640625,
"completions/mean_terminated_length": 913.7601318359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 585.0,
"epoch": 0.1824,
"grad_norm": 0.4127947688102722,
"learning_rate": 8.333333333333333e-07,
"loss": -0.0604,
"num_tokens": 51471093.0,
"reward": 1.082554817199707,
"reward_std": 0.3952634930610657,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.5898253917694092,
"rewards/format_reward_step": 0.9609375,
"rewards/stepwise_brier_reward": 0.6568180322647095,
"step": 171
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 11.1875,
"calib/ece": 0.24062992125984253,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9173228346456693,
"calib/gap": 0.014017182130584338,
"calib/mean_conf": 0.9650393700787402,
"calib/mu_c": 0.9683505154639176,
"calib/mu_w": 0.9543333333333333,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2209448818897638,
"calib/std_conf": 0.12837183943006472,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2567.0,
"completions/max_terminated_length": 2567.0,
"completions/mean_length": 898.984375,
"completions/mean_terminated_length": 909.644287109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 596.0,
"epoch": 0.18346666666666667,
"grad_norm": 0.4310997724533081,
"learning_rate": 8.055555555555557e-07,
"loss": -0.0105,
"num_tokens": 51827625.0,
"reward": 1.4327988624572754,
"reward_std": 0.3014458417892456,
"rewards/accuracy_reward_step": 0.7578125,
"rewards/final_brier_reward_step": 0.7577491998672485,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.7891346216201782,
"step": 172
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/avg_num_step_conf": 11.22265625,
"calib/ece": 0.3241224489795919,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.9510204081632653,
"calib/gap": 0.024891304347825827,
"calib/mean_conf": 0.9802857142857143,
"calib/mu_c": 0.9888198757763974,
"calib/mu_w": 0.9639285714285716,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3236326530612246,
"calib/std_conf": 0.08761884057946553,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2031.0,
"completions/max_terminated_length": 2031.0,
"completions/mean_length": 836.6796875,
"completions/mean_terminated_length": 874.244873046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 466.0,
"epoch": 0.18453333333333333,
"grad_norm": 0.36348190903663635,
"learning_rate": 7.777777777777779e-07,
"loss": -0.0406,
"num_tokens": 52168015.0,
"reward": 1.2189308404922485,
"reward_std": 0.24862341582775116,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.6448589563369751,
"rewards/format_reward_step": 0.95703125,
"rewards/stepwise_brier_reward": 0.6875680685043335,
"step": 173
},
{
"calib/answer_extract_rate": 0.94921875,
"calib/avg_num_step_conf": 10.60546875,
"calib/ece": 0.3365432098765434,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.8559670781893004,
"calib/gap": 0.09681122448979584,
"calib/mean_conf": 0.9414814814814816,
"calib/mu_c": 0.9797278911564624,
"calib/mu_w": 0.8829166666666666,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3365432098765434,
"calib/std_conf": 0.15306292133471497,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2782.0,
"completions/max_terminated_length": 2782.0,
"completions/mean_length": 898.26171875,
"completions/mean_terminated_length": 946.3168334960938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 523.0,
"epoch": 0.1856,
"grad_norm": 0.4376189410686493,
"learning_rate": 7.5e-07,
"loss": -0.0946,
"num_tokens": 52525242.0,
"reward": 1.1533571481704712,
"reward_std": 0.48044002056121826,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.6326359510421753,
"rewards/format_reward_step": 0.94140625,
"rewards/stepwise_brier_reward": 0.6747190356254578,
"step": 174
},
{
"calib/answer_extract_rate": 0.94140625,
"calib/avg_num_step_conf": 10.56640625,
"calib/ece": 0.37688796680497927,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.8796680497925311,
"calib/gap": 0.02313971742543175,
"calib/mean_conf": 0.9559751037344397,
"calib/mu_c": 0.9653846153846155,
"calib/mu_w": 0.9422448979591838,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3697510373443984,
"calib/std_conf": 0.1329773301526723,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 2255.0,
"completions/max_terminated_length": 2255.0,
"completions/mean_length": 861.69140625,
"completions/mean_terminated_length": 911.541259765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 557.0,
"epoch": 0.18666666666666668,
"grad_norm": 0.4053885340690613,
"learning_rate": 7.222222222222222e-07,
"loss": -0.0515,
"num_tokens": 52874699.0,
"reward": 1.1094977855682373,
"reward_std": 0.5093092918395996,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.5843409895896912,
"rewards/format_reward_step": 0.94140625,
"rewards/stepwise_brier_reward": 0.6583711504936218,
"step": 175
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 10.9921875,
"calib/ece": 0.30984313725490203,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.8823529411764706,
"calib/gap": 0.07997932782075223,
"calib/mean_conf": 0.9490588235294117,
"calib/mu_c": 0.9779141104294479,
"calib/mu_w": 0.8979347826086956,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.30984313725490203,
"calib/std_conf": 0.1494115794348087,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1650.0,
"completions/max_terminated_length": 1650.0,
"completions/mean_length": 903.86328125,
"completions/mean_terminated_length": 910.9802856445312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 517.0,
"epoch": 0.18773333333333334,
"grad_norm": 0.3664768636226654,
"learning_rate": 6.944444444444446e-07,
"loss": 0.0227,
"num_tokens": 53233192.0,
"reward": 1.2727751731872559,
"reward_std": 0.27945634722709656,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.6852566599845886,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.7752746343612671,
"step": 176
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 10.94140625,
"calib/ece": 0.2897200000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.852,
"calib/gap": 0.07533333333333314,
"calib/mean_conf": 0.94372,
"calib/mu_c": 0.9693333333333333,
"calib/mu_w": 0.8940000000000001,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2867200000000001,
"calib/std_conf": 0.1444581655705208,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 1574.0,
"completions/max_terminated_length": 1574.0,
"completions/mean_length": 862.55078125,
"completions/mean_terminated_length": 883.2520141601562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 450.0,
"epoch": 0.1888,
"grad_norm": 0.38255923986434937,
"learning_rate": 6.666666666666667e-07,
"loss": -0.0378,
"num_tokens": 53580877.0,
"reward": 1.2706055641174316,
"reward_std": 0.31008511781692505,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.6875433921813965,
"rewards/format_reward_step": 0.97265625,
"rewards/stepwise_brier_reward": 0.7401479482650757,
"step": 177
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 10.94140625,
"calib/ece": 0.2507569721115539,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.8924302788844621,
"calib/gap": 0.057092685906245166,
"calib/mean_conf": 0.9559362549800797,
"calib/mu_c": 0.9727683615819209,
"calib/mu_w": 0.9156756756756758,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2507569721115539,
"calib/std_conf": 0.13001463264384003,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1506.0,
"completions/max_terminated_length": 1506.0,
"completions/mean_length": 841.1328125,
"completions/mean_terminated_length": 854.4841918945312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 531.0,
"epoch": 0.18986666666666666,
"grad_norm": 0.37112635374069214,
"learning_rate": 6.388888888888889e-07,
"loss": -0.017,
"num_tokens": 53925319.0,
"reward": 1.3369669914245605,
"reward_std": 0.4431039094924927,
"rewards/accuracy_reward_step": 0.69140625,
"rewards/final_brier_reward_step": 0.7177726626396179,
"rewards/format_reward_step": 0.9765625,
"rewards/stepwise_brier_reward": 0.7560725808143616,
"step": 178
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/avg_num_step_conf": 10.47265625,
"calib/ece": 0.27591836734693886,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.8857142857142857,
"calib/gap": 0.1015212749733112,
"calib/mean_conf": 0.953469387755102,
"calib/mu_c": 0.9862048192771085,
"calib/mu_w": 0.8846835443037973,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.27591836734693886,
"calib/std_conf": 0.1372700454921651,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2137.0,
"completions/max_terminated_length": 2137.0,
"completions/mean_length": 858.78125,
"completions/mean_terminated_length": 897.3387451171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 439.0,
"epoch": 0.19093333333333334,
"grad_norm": 0.36745166778564453,
"learning_rate": 6.111111111111112e-07,
"loss": -0.0585,
"num_tokens": 54274471.0,
"reward": 1.281066656112671,
"reward_std": 0.4145931005477905,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.6955976486206055,
"rewards/format_reward_step": 0.953125,
"rewards/stepwise_brier_reward": 0.7580711841583252,
"step": 179
},
{
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 10.26953125,
"calib/ece": 0.2721825396825397,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.8174603174603174,
"calib/gap": 0.10623385939741747,
"calib/mean_conf": 0.9333730158730158,
"calib/mu_c": 0.9679411764705883,
"calib/mu_w": 0.8617073170731708,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2654761904761905,
"calib/std_conf": 0.15649475036212718,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1696.0,
"completions/max_terminated_length": 1696.0,
"completions/mean_length": 926.98046875,
"completions/mean_terminated_length": 941.6945190429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 533.0,
"epoch": 0.192,
"grad_norm": 0.44686973094940186,
"learning_rate": 5.833333333333334e-07,
"loss": -0.0096,
"num_tokens": 54638674.0,
"reward": 1.3219671249389648,
"reward_std": 0.322432279586792,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.7163659930229187,
"rewards/format_reward_step": 0.97265625,
"rewards/stepwise_brier_reward": 0.8098236918449402,
"step": 180
},
{
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 10.296875,
"calib/ece": 0.3192460317460317,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.8293650793650794,
"calib/gap": 0.1577126704356503,
"calib/mean_conf": 0.9343253968253967,
"calib/mu_c": 0.9950322580645162,
"calib/mu_w": 0.8373195876288659,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3192460317460317,
"calib/std_conf": 0.1545857921439711,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2251.0,
"completions/max_terminated_length": 2251.0,
"completions/mean_length": 891.0234375,
"completions/mean_terminated_length": 905.166748046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 590.0,
"epoch": 0.19306666666666666,
"grad_norm": 0.3694749176502228,
"learning_rate": 5.555555555555555e-07,
"loss": -0.0261,
"num_tokens": 54996080.0,
"reward": 1.23591947555542,
"reward_std": 0.4027971625328064,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.6970745921134949,
"rewards/format_reward_step": 0.98046875,
"rewards/stepwise_brier_reward": 0.7354660034179688,
"step": 181
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 10.453125,
"calib/ece": 0.27925490196078434,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.8549019607843137,
"calib/gap": 0.1389244663382594,
"calib/mean_conf": 0.938078431372549,
"calib/mu_c": 0.9854761904761905,
"calib/mu_w": 0.8465517241379311,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.27925490196078434,
"calib/std_conf": 0.16721898793782713,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1656.0,
"completions/max_terminated_length": 1656.0,
"completions/mean_length": 879.390625,
"completions/mean_terminated_length": 886.31494140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 531.0,
"epoch": 0.19413333333333332,
"grad_norm": 0.41442859172821045,
"learning_rate": 5.277777777777779e-07,
"loss": -0.0078,
"num_tokens": 55350404.0,
"reward": 1.3059650659561157,
"reward_std": 0.33462876081466675,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.7249683737754822,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.7520487308502197,
"step": 182
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 9.91796875,
"calib/ece": 0.21244979919678725,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.8152610441767069,
"calib/gap": 0.08924490809736707,
"calib/mean_conf": 0.9210441767068273,
"calib/mu_c": 0.9446994535519125,
"calib/mu_w": 0.8554545454545455,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19927710843373503,
"calib/std_conf": 0.1821660008445034,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2061.0,
"completions/max_terminated_length": 2061.0,
"completions/mean_length": 850.04296875,
"completions/mean_terminated_length": 870.4440307617188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 424.0,
"epoch": 0.1952,
"grad_norm": 0.36135947704315186,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0379,
"num_tokens": 55697735.0,
"reward": 1.369386911392212,
"reward_std": 0.38945871591567993,
"rewards/accuracy_reward_step": 0.71484375,
"rewards/final_brier_reward_step": 0.7393156290054321,
"rewards/format_reward_step": 0.9453125,
"rewards/stepwise_brier_reward": 0.7614158391952515,
"step": 183
},
{
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 9.98828125,
"calib/ece": 0.20786290322580644,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.8790322580645161,
"calib/gap": 0.12473392181588894,
"calib/mean_conf": 0.941733870967742,
"calib/mu_c": 0.9744262295081967,
"calib/mu_w": 0.8496923076923077,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20584677419354838,
"calib/std_conf": 0.17850809184111946,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2501.0,
"completions/max_terminated_length": 2501.0,
"completions/mean_length": 848.4609375,
"completions/mean_terminated_length": 875.8306274414062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 582.0,
"epoch": 0.19626666666666667,
"grad_norm": 0.4687948226928711,
"learning_rate": 4.7222222222222226e-07,
"loss": -0.0518,
"num_tokens": 56043261.0,
"reward": 1.3840022087097168,
"reward_std": 0.442663311958313,
"rewards/accuracy_reward_step": 0.71484375,
"rewards/final_brier_reward_step": 0.7531074285507202,
"rewards/format_reward_step": 0.95703125,
"rewards/stepwise_brier_reward": 0.7876062393188477,
"step": 184
},
{
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 10.0390625,
"calib/ece": 0.22297619047619047,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.7817460317460317,
"calib/gap": 0.2493959827833574,
"calib/mean_conf": 0.8970238095238096,
"calib/mu_c": 0.9781764705882354,
"calib/mu_w": 0.728780487804878,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2226984126984127,
"calib/std_conf": 0.214737453734278,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 1970.0,
"completions/max_terminated_length": 1970.0,
"completions/mean_length": 857.51171875,
"completions/mean_terminated_length": 874.5936279296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 501.0,
"epoch": 0.19733333333333333,
"grad_norm": 0.48540031909942627,
"learning_rate": 4.444444444444445e-07,
"loss": 0.002,
"num_tokens": 56392744.0,
"reward": 1.350165843963623,
"reward_std": 0.3727245628833771,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.778076171875,
"rewards/format_reward_step": 0.98046875,
"rewards/stepwise_brier_reward": 0.7960731983184814,
"step": 185
},
{
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 9.8203125,
"calib/ece": 0.21489959839357436,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.7389558232931727,
"calib/gap": 0.1803615520282188,
"calib/mean_conf": 0.8815662650602409,
"calib/mu_c": 0.9402380952380953,
"calib/mu_w": 0.7598765432098765,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21088353413654626,
"calib/std_conf": 0.2124960586133561,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 1933.0,
"completions/max_terminated_length": 1933.0,
"completions/mean_length": 888.76171875,
"completions/mean_terminated_length": 913.7469482421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 560.0,
"epoch": 0.1984,
"grad_norm": 0.3782590329647064,
"learning_rate": 4.1666666666666667e-07,
"loss": -0.0279,
"num_tokens": 56748347.0,
"reward": 1.3173680305480957,
"reward_std": 0.3679579496383667,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.7408746480941772,
"rewards/format_reward_step": 0.95703125,
"rewards/stepwise_brier_reward": 0.7799100875854492,
"step": 186
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/avg_num_step_conf": 9.6484375,
"calib/ece": 0.24423387096774204,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.7056451612903226,
"calib/gap": 0.09330047462577595,
"calib/mean_conf": 0.8766532258064514,
"calib/mu_c": 0.907878787878788,
"calib/mu_w": 0.814578313253012,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.22778225806451624,
"calib/std_conf": 0.20995508522580444,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2672.0,
"completions/max_terminated_length": 2672.0,
"completions/mean_length": 940.7578125,
"completions/mean_terminated_length": 967.2047729492188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 506.0,
"epoch": 0.19946666666666665,
"grad_norm": 0.4038413166999817,
"learning_rate": 3.8888888888888895e-07,
"loss": -0.0163,
"num_tokens": 57113765.0,
"reward": 1.2750946283340454,
"reward_std": 0.4495476484298706,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.6956921815872192,
"rewards/format_reward_step": 0.953125,
"rewards/stepwise_brier_reward": 0.7496190071105957,
"step": 187
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 10.046875,
"calib/ece": 0.1967450980392157,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.803921568627451,
"calib/gap": 0.20235303123086357,
"calib/mean_conf": 0.9109411764705884,
"calib/mu_c": 0.9672826086956523,
"calib/mu_w": 0.7649295774647887,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19305882352941175,
"calib/std_conf": 0.19630882150972606,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1942.0,
"completions/max_terminated_length": 1942.0,
"completions/mean_length": 895.890625,
"completions/mean_terminated_length": 902.9448852539062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 542.0,
"epoch": 0.20053333333333334,
"grad_norm": 0.37633347511291504,
"learning_rate": 3.611111111111111e-07,
"loss": 0.0081,
"num_tokens": 57470225.0,
"reward": 1.4211552143096924,
"reward_std": 0.27990084886550903,
"rewards/accuracy_reward_step": 0.71875,
"rewards/final_brier_reward_step": 0.795285165309906,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.8237380981445312,
"step": 188
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 10.16796875,
"calib/ece": 0.21062745098039215,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.7176470588235294,
"calib/gap": 0.15619883040935656,
"calib/mean_conf": 0.878078431372549,
"calib/mu_c": 0.92953216374269,
"calib/mu_w": 0.7733333333333334,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20905882352941177,
"calib/std_conf": 0.2113370918640157,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1918.0,
"completions/max_terminated_length": 1918.0,
"completions/mean_length": 867.20703125,
"completions/mean_terminated_length": 874.035400390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 546.0,
"epoch": 0.2016,
"grad_norm": 0.3926154375076294,
"learning_rate": 3.3333333333333335e-07,
"loss": 0.0016,
"num_tokens": 57823038.0,
"reward": 1.3388947248458862,
"reward_std": 0.3091922998428345,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.7508214712142944,
"rewards/format_reward_step": 0.984375,
"rewards/stepwise_brier_reward": 0.7883110046386719,
"step": 189
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 9.83203125,
"calib/ece": 0.20427419354838708,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.75,
"calib/gap": 0.26648640551269565,
"calib/mean_conf": 0.8783064516129032,
"calib/mu_c": 0.9631952662721893,
"calib/mu_w": 0.6967088607594937,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20056451612903223,
"calib/std_conf": 0.2390664699516661,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2343.0,
"completions/max_terminated_length": 2343.0,
"completions/mean_length": 892.70703125,
"completions/mean_terminated_length": 917.8031616210938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 546.0,
"epoch": 0.20266666666666666,
"grad_norm": 0.3378767669200897,
"learning_rate": 3.055555555555556e-07,
"loss": -0.0152,
"num_tokens": 58180219.0,
"reward": 1.346990704536438,
"reward_std": 0.2692444920539856,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.7776304483413696,
"rewards/format_reward_step": 0.96875,
"rewards/stepwise_brier_reward": 0.8045772314071655,
"step": 190
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 10.078125,
"calib/ece": 0.27876,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.752,
"calib/gap": 0.17710599016238815,
"calib/mean_conf": 0.89076,
"calib/mu_c": 0.9594771241830066,
"calib/mu_w": 0.7823711340206184,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.27876,
"calib/std_conf": 0.20569254337481466,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2427.0,
"completions/max_terminated_length": 2427.0,
"completions/mean_length": 877.078125,
"completions/mean_terminated_length": 898.1280517578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 505.0,
"epoch": 0.20373333333333332,
"grad_norm": 0.4926356077194214,
"learning_rate": 2.7777777777777776e-07,
"loss": -0.0411,
"num_tokens": 58531959.0,
"reward": 1.2263085842132568,
"reward_std": 0.286679208278656,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.7017941474914551,
"rewards/format_reward_step": 0.96875,
"rewards/stepwise_brier_reward": 0.7235209345817566,
"step": 191
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 9.90234375,
"calib/ece": 0.18468,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.784,
"calib/gap": 0.17026425250795207,
"calib/mean_conf": 0.89732,
"calib/mu_c": 0.9429508196721311,
"calib/mu_w": 0.772686567164179,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17500000000000002,
"calib/std_conf": 0.20747148623365091,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 1855.0,
"completions/max_terminated_length": 1855.0,
"completions/mean_length": 889.34375,
"completions/mean_terminated_length": 910.6880493164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 404.0,
"epoch": 0.2048,
"grad_norm": 0.39645713567733765,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.0324,
"num_tokens": 58887647.0,
"reward": 1.3959075212478638,
"reward_std": 0.282784640789032,
"rewards/accuracy_reward_step": 0.71484375,
"rewards/final_brier_reward_step": 0.7707542777061462,
"rewards/format_reward_step": 0.96484375,
"rewards/stepwise_brier_reward": 0.7968087196350098,
"step": 192
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/avg_num_step_conf": 9.890625,
"calib/ece": 0.25155102040816335,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.7918367346938775,
"calib/gap": 0.16522256728778462,
"calib/mean_conf": 0.9086938775510204,
"calib/mu_c": 0.9653416149068323,
"calib/mu_w": 0.8001190476190477,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.25155102040816335,
"calib/std_conf": 0.1963377144763097,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 1972.0,
"completions/max_terminated_length": 1972.0,
"completions/mean_length": 879.12109375,
"completions/mean_terminated_length": 918.591796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 573.0,
"epoch": 0.20586666666666667,
"grad_norm": 0.4955199956893921,
"learning_rate": 2.2222222222222224e-07,
"loss": -0.0345,
"num_tokens": 59241454.0,
"reward": 1.2673017978668213,
"reward_std": 0.43317940831184387,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.7152073979377747,
"rewards/format_reward_step": 0.95703125,
"rewards/stepwise_brier_reward": 0.7403548955917358,
"step": 193
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 10.1875,
"calib/ece": 0.21822134387351774,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.7193675889328063,
"calib/gap": 0.23310701562071812,
"calib/mean_conf": 0.8596442687747036,
"calib/mu_c": 0.9416463414634146,
"calib/mu_w": 0.7085393258426965,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21482213438735176,
"calib/std_conf": 0.247046330200576,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1501.0,
"completions/max_terminated_length": 1501.0,
"completions/mean_length": 906.625,
"completions/mean_terminated_length": 921.0159301757812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 590.0,
"epoch": 0.20693333333333333,
"grad_norm": 0.4592355787754059,
"learning_rate": 1.9444444444444447e-07,
"loss": -0.0213,
"num_tokens": 59602534.0,
"reward": 1.322446584701538,
"reward_std": 0.2598746418952942,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.763495683670044,
"rewards/format_reward_step": 0.98828125,
"rewards/stepwise_brier_reward": 0.8049821853637695,
"step": 194
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/avg_num_step_conf": 10.02734375,
"calib/ece": 0.28528225806451624,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.7862903225806451,
"calib/gap": 0.17409643547941434,
"calib/mean_conf": 0.8984274193548387,
"calib/mu_c": 0.9644155844155845,
"calib/mu_w": 0.7903191489361702,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2813709677419356,
"calib/std_conf": 0.21392724394006452,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1850.0,
"completions/max_terminated_length": 1850.0,
"completions/mean_length": 873.3046875,
"completions/mean_terminated_length": 901.4757690429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 484.0,
"epoch": 0.208,
"grad_norm": 0.4438525140285492,
"learning_rate": 1.6666666666666668e-07,
"loss": -0.0591,
"num_tokens": 59955124.0,
"reward": 1.2250550985336304,
"reward_std": 0.3435608744621277,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6959050893783569,
"rewards/format_reward_step": 0.9609375,
"rewards/stepwise_brier_reward": 0.7177851796150208,
"step": 195
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 10.67578125,
"calib/ece": 0.237992125984252,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.8622047244094488,
"calib/gap": 0.1403933933933933,
"calib/mean_conf": 0.9466535433070866,
"calib/mu_c": 0.9875555555555555,
"calib/mu_w": 0.8471621621621622,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.237992125984252,
"calib/std_conf": 0.14324220323165088,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1633.0,
"completions/max_terminated_length": 1633.0,
"completions/mean_length": 852.4296875,
"completions/mean_terminated_length": 859.1417236328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 584.0,
"epoch": 0.20906666666666668,
"grad_norm": 0.370975524187088,
"learning_rate": 1.3888888888888888e-07,
"loss": -0.0231,
"num_tokens": 60298930.0,
"reward": 1.385732650756836,
"reward_std": 0.22724959254264832,
"rewards/accuracy_reward_step": 0.703125,
"rewards/final_brier_reward_step": 0.7683027386665344,
"rewards/format_reward_step": 0.9921875,
"rewards/stepwise_brier_reward": 0.7969499826431274,
"step": 196
},
{
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 10.14453125,
"calib/ece": 0.3686111111111112,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.8055555555555556,
"calib/gap": 0.07068238451217168,
"calib/mean_conf": 0.9190079365079364,
"calib/mu_c": 0.9501418439716312,
"calib/mu_w": 0.8794594594594595,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3640476190476192,
"calib/std_conf": 0.17895920418466515,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 1786.0,
"completions/max_terminated_length": 1786.0,
"completions/mean_length": 915.42578125,
"completions/mean_terminated_length": 933.661376953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 549.0,
"epoch": 0.21013333333333334,
"grad_norm": 0.7663440704345703,
"learning_rate": 1.1111111111111112e-07,
"loss": -0.0453,
"num_tokens": 60661375.0,
"reward": 1.1283903121948242,
"reward_std": 0.38009026646614075,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.6173292398452759,
"rewards/format_reward_step": 0.984375,
"rewards/stepwise_brier_reward": 0.6820275187492371,
"step": 197
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 10.44140625,
"calib/ece": 0.19290196078431368,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.796078431372549,
"calib/gap": 0.16206284153005468,
"calib/mean_conf": 0.9104705882352941,
"calib/mu_c": 0.9562295081967214,
"calib/mu_w": 0.7941666666666667,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1928627450980392,
"calib/std_conf": 0.19217989457809093,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2108.0,
"completions/max_terminated_length": 2108.0,
"completions/mean_length": 894.81640625,
"completions/mean_terminated_length": 901.8621826171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 508.0,
"epoch": 0.2112,
"grad_norm": 0.9934461712837219,
"learning_rate": 8.333333333333334e-08,
"loss": -0.0034,
"num_tokens": 61018872.0,
"reward": 1.4082475900650024,
"reward_std": 0.3323134183883667,
"rewards/accuracy_reward_step": 0.71484375,
"rewards/final_brier_reward_step": 0.7858519554138184,
"rewards/format_reward_step": 0.99609375,
"rewards/stepwise_brier_reward": 0.803473711013794,
"step": 198
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 10.34375,
"calib/ece": 0.22410358565737049,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.8087649402390438,
"calib/gap": 0.1680443609022556,
"calib/mean_conf": 0.9133466135458167,
"calib/mu_c": 0.9642285714285714,
"calib/mu_w": 0.7961842105263158,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.22011952191235057,
"calib/std_conf": 0.1921858832578285,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2233.0,
"completions/max_terminated_length": 2233.0,
"completions/mean_length": 915.91015625,
"completions/mean_terminated_length": 934.1553955078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 577.0,
"epoch": 0.21226666666666666,
"grad_norm": 0.48377180099487305,
"learning_rate": 5.555555555555556e-08,
"loss": -0.0161,
"num_tokens": 61380585.0,
"reward": 1.3653557300567627,
"reward_std": 0.35379722714424133,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.7610331773757935,
"rewards/format_reward_step": 0.98046875,
"rewards/stepwise_brier_reward": 0.8127938508987427,
"step": 199
},
{
"calib/answer_extract_rate": 0.953125,
"calib/avg_num_step_conf": 10.03515625,
"calib/ece": 0.2200819672131148,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.7909836065573771,
"calib/gap": 0.17872466554514133,
"calib/mean_conf": 0.9123770491803279,
"calib/mu_c": 0.9658479532163743,
"calib/mu_w": 0.787123287671233,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21581967213114756,
"calib/std_conf": 0.1854460872505499,
"calib/step_conf_rate": 1.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2085.0,
"completions/max_terminated_length": 2085.0,
"completions/mean_length": 858.55078125,
"completions/mean_terminated_length": 900.7745361328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 605.0,
"epoch": 0.21333333333333335,
"grad_norm": 0.3373366892337799,
"learning_rate": 2.777777777777778e-08,
"loss": -0.0863,
"num_tokens": 61731462.0,
"reward": 1.3362358808517456,
"reward_std": 0.3129503130912781,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.7492789030075073,
"rewards/format_reward_step": 0.953125,
"rewards/stepwise_brier_reward": 0.7932606935501099,
"step": 200
},
{
"epoch": 0.21333333333333335,
"step": 200,
"total_flos": 0.0,
"train_loss": -0.07741938081453555,
"train_runtime": 10541.1044,
"train_samples_per_second": 4.857,
"train_steps_per_second": 0.019
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 61731462,
"num_train_epochs": 1,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}