Files
PureRL-1.5B-v7-s2-async-l2-…/trainer_state.json
ModelHub XC 881798b83f 初始化项目,由ModelHub XC社区提供模型
Model: zhaohq/PureRL-1.5B-v7-s2-async-l2-maskoff-afew
Source: Original Platform
2026-06-02 08:45:21 +08:00

9843 lines
383 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21333333333333335,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5086206896551725,
"calib/avg_num_step_conf": 7.875,
"calib/ece": 0.2888991935483871,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0001713264989126051,
"calib/mean_conf": 0.9905120967741936,
"calib/mu_c": 0.9905632183908043,
"calib/mu_w": 0.9903918918918917,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2888991935483871,
"calib/std_conf": 0.0021794159006610276,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9119477557027226,
"calib/step_q_c_n": 1359.0,
"calib/step_q_gap": 0.0056311651395566376,
"calib/step_q_w": 0.9063165905631659,
"calib/step_q_w_n": 657.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2494.0,
"completions/max_terminated_length": 2494.0,
"completions/mean_length": 755.49609375,
"completions/mean_terminated_length": 776.7349243164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 397.0,
"epoch": 0.0010666666666666667,
"grad_norm": 0.00794170517474413,
"kl": 0.0005849599838256836,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.0572,
"num_tokens": 300991.0,
"reward": 0.8751538991928101,
"reward_std": 0.2377150058746338,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/asymmetric_l2_reward": 0.7354698181152344,
"rewards/final_brier_reward_step": 0.6851503849029541,
"rewards/format_reward_step": 0.96875,
"step": 1
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.4872611464968153,
"calib/avg_num_step_conf": 7.6953125,
"calib/ece": 0.36465737051792824,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00024481637078155316,
"calib/mean_conf": 0.9901553784860557,
"calib/mu_c": 0.990063694267516,
"calib/mu_w": 0.9903085106382975,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.36465737051792824,
"calib/std_conf": 0.001222205307190084,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9075405636208369,
"calib/step_q_c_n": 1171.0,
"calib/step_q_gap": -0.003804868168900244,
"calib/step_q_w": 0.9113454317897371,
"calib/step_q_w_n": 799.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2743.0,
"completions/max_terminated_length": 2743.0,
"completions/mean_length": 840.640625,
"completions/mean_terminated_length": 850.6087036132812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 466.0,
"epoch": 0.0021333333333333334,
"grad_norm": 0.006795755121856928,
"kl": 0.0016820430755615234,
"learning_rate": 5.000000000000001e-07,
"loss": 0.0266,
"num_tokens": 619483.0,
"reward": 0.8191705346107483,
"reward_std": 0.21779605746269226,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/asymmetric_l2_reward": 0.6992889642715454,
"rewards/final_brier_reward_step": 0.6203019618988037,
"rewards/format_reward_step": 0.98046875,
"step": 2
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.4995745887691435,
"calib/avg_num_step_conf": 7.703125,
"calib/ece": 0.31306692913385814,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 8.508224618175575e-07,
"calib/mean_conf": 0.9902322834645668,
"calib/mu_c": 0.9902325581395348,
"calib/mu_w": 0.990231707317073,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.31306692913385814,
"calib/std_conf": 0.0014944718019728367,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9121462264150944,
"calib/step_q_c_n": 1272.0,
"calib/step_q_gap": -0.0028009164420484955,
"calib/step_q_w": 0.9149471428571428,
"calib/step_q_w_n": 700.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2224.0,
"completions/max_terminated_length": 2224.0,
"completions/mean_length": 802.05859375,
"completions/mean_terminated_length": 805.2039794921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 431.0,
"epoch": 0.0032,
"grad_norm": 0.0075466628186404705,
"kl": 0.0005320906639099121,
"learning_rate": 7.5e-07,
"loss": 0.0078,
"num_tokens": 930066.0,
"reward": 0.8693341016769409,
"reward_std": 0.2241458147764206,
"rewards/accuracy_reward_step": 0.671875,
"rewards/asymmetric_l2_reward": 0.7278196811676025,
"rewards/final_brier_reward_step": 0.6780359148979187,
"rewards/format_reward_step": 0.9921875,
"step": 3
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5029940119760479,
"calib/avg_num_step_conf": 8.01953125,
"calib/ece": 0.3273412698412699,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 5.988023952108179e-05,
"calib/mean_conf": 0.9900396825396826,
"calib/mu_c": 0.9900598802395209,
"calib/mu_w": 0.9899999999999998,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3273412698412699,
"calib/std_conf": 0.0006286896634029713,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9098512137823024,
"calib/step_q_c_n": 1277.0,
"calib/step_q_gap": -0.007713219207388322,
"calib/step_q_w": 0.9175644329896907,
"calib/step_q_w_n": 776.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2714.0,
"completions/max_terminated_length": 2714.0,
"completions/mean_length": 781.55859375,
"completions/mean_terminated_length": 793.96435546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 367.0,
"epoch": 0.004266666666666667,
"grad_norm": 0.007215971127152443,
"kl": 0.0005492568016052246,
"learning_rate": 1.0000000000000002e-06,
"loss": -0.0391,
"num_tokens": 1236313.0,
"reward": 0.8469743728637695,
"reward_std": 0.21102729439735413,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/asymmetric_l2_reward": 0.707718551158905,
"rewards/final_brier_reward_step": 0.6588863134384155,
"rewards/format_reward_step": 0.984375,
"step": 4
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5189616967394746,
"calib/avg_num_step_conf": 7.52734375,
"calib/ece": 0.4544007936507938,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00037435897435889043,
"calib/mean_conf": 0.9901150793650795,
"calib/mu_c": 0.9902888888888887,
"calib/mu_w": 0.9899145299145298,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4544007936507938,
"calib/std_conf": 0.0013767668279207968,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9129371980676328,
"calib/step_q_c_n": 1035.0,
"calib/step_q_gap": 0.011614328112475869,
"calib/step_q_w": 0.901322869955157,
"calib/step_q_w_n": 892.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2478.0,
"completions/max_terminated_length": 2478.0,
"completions/mean_length": 822.24609375,
"completions/mean_terminated_length": 831.99609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 451.0,
"epoch": 0.005333333333333333,
"grad_norm": 0.006809460464864969,
"kl": 0.0006694197654724121,
"learning_rate": 1.25e-06,
"loss": -0.0171,
"num_tokens": 1553496.0,
"reward": 0.7390369772911072,
"reward_std": 0.20035284757614136,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/asymmetric_l2_reward": 0.6392657160758972,
"rewards/final_brier_reward_step": 0.5364644527435303,
"rewards/format_reward_step": 0.984375,
"step": 5
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.502243039978889,
"calib/avg_num_step_conf": 8.32421875,
"calib/ece": 0.4158192771084338,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 3.852751022559442e-05,
"calib/mean_conf": 0.9901164658634539,
"calib/mu_c": 0.990132867132867,
"calib/mu_w": 0.9900943396226414,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4158192771084338,
"calib/std_conf": 0.001055911813966895,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9112607573149744,
"calib/step_q_c_n": 1162.0,
"calib/step_q_gap": -0.006603019774809038,
"calib/step_q_w": 0.9178637770897834,
"calib/step_q_w_n": 969.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2701.0,
"completions/max_terminated_length": 2701.0,
"completions/mean_length": 739.35546875,
"completions/mean_terminated_length": 748.12255859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 449.0,
"epoch": 0.0064,
"grad_norm": 0.007401874754577875,
"kl": 0.000727236270904541,
"learning_rate": 1.5e-06,
"loss": 0.01,
"num_tokens": 1848723.0,
"reward": 0.7569711208343506,
"reward_std": 0.20083239674568176,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/asymmetric_l2_reward": 0.6409914493560791,
"rewards/final_brier_reward_step": 0.5667007565498352,
"rewards/format_reward_step": 0.97265625,
"step": 6
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.489010989010989,
"calib/avg_num_step_conf": 7.2109375,
"calib/ece": 0.35118650793650796,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00020879120879124802,
"calib/mean_conf": 0.9900753968253968,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.9902087912087911,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.35118650793650796,
"calib/std_conf": 0.0008441381918702945,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9099402220324508,
"calib/step_q_c_n": 1171.0,
"calib/step_q_gap": 0.0027742961065249094,
"calib/step_q_w": 0.9071659259259259,
"calib/step_q_w_n": 675.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2497.0,
"completions/max_terminated_length": 2497.0,
"completions/mean_length": 868.70703125,
"completions/mean_terminated_length": 872.11376953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 423.0,
"epoch": 0.007466666666666667,
"grad_norm": 0.007204503286629915,
"kl": 0.0005515217781066895,
"learning_rate": 1.75e-06,
"loss": 0.0165,
"num_tokens": 2178536.0,
"reward": 0.8298871517181396,
"reward_std": 0.2348572164773941,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/asymmetric_l2_reward": 0.701348602771759,
"rewards/final_brier_reward_step": 0.6357694864273071,
"rewards/format_reward_step": 0.984375,
"step": 7
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5087550028587764,
"calib/avg_num_step_conf": 7.56640625,
"calib/ece": 0.34198380566801617,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.9919028340080972,
"calib/gap": 0.012221983990851504,
"calib/mean_conf": 0.9857085020242914,
"calib/mu_c": 0.9900628930817608,
"calib/mu_w": 0.9778409090909093,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.34198380566801617,
"calib/std_conf": 0.06311261742570078,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9095822890559733,
"calib/step_q_c_n": 1197.0,
"calib/step_q_gap": 0.00013634311002730382,
"calib/step_q_w": 0.909445945945946,
"calib/step_q_w_n": 740.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2612.0,
"completions/max_terminated_length": 2612.0,
"completions/mean_length": 835.70703125,
"completions/mean_terminated_length": 852.3546142578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 397.0,
"epoch": 0.008533333333333334,
"grad_norm": 0.006909141317009926,
"kl": 0.0005980730056762695,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.0162,
"num_tokens": 2498989.0,
"reward": 0.826758086681366,
"reward_std": 0.1921631246805191,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/asymmetric_l2_reward": 0.7040407657623291,
"rewards/final_brier_reward_step": 0.632287859916687,
"rewards/format_reward_step": 0.96484375,
"step": 8
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.49755799755799757,
"calib/avg_num_step_conf": 8.44921875,
"calib/ece": 0.2472653061224489,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -4.8840048840537165e-05,
"calib/mean_conf": 0.9901224489795918,
"calib/mu_c": 0.9901098901098899,
"calib/mu_w": 0.9901587301587305,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2472653061224489,
"calib/std_conf": 0.0010997709049230609,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8988439716312056,
"calib/step_q_c_n": 1410.0,
"calib/step_q_gap": 0.0900657511796783,
"calib/step_q_w": 0.8087782204515273,
"calib/step_q_w_n": 753.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 3003.0,
"completions/max_terminated_length": 3003.0,
"completions/mean_length": 776.78515625,
"completions/mean_terminated_length": 808.3617553710938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 430.0,
"epoch": 0.0096,
"grad_norm": 0.007497426588088274,
"kl": 0.0006467700004577637,
"learning_rate": 2.25e-06,
"loss": -0.0483,
"num_tokens": 2805382.0,
"reward": 0.883492112159729,
"reward_std": 0.24604354798793793,
"rewards/accuracy_reward_step": 0.7109375,
"rewards/asymmetric_l2_reward": 0.7177037000656128,
"rewards/final_brier_reward_step": 0.7156867384910583,
"rewards/format_reward_step": 0.95703125,
"step": 9
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.49134020618556706,
"calib/avg_num_step_conf": 7.6875,
"calib/ece": 0.3828906882591092,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00011408934707923812,
"calib/mean_conf": 0.9901781376518217,
"calib/mu_c": 0.9901333333333332,
"calib/mu_w": 0.9902474226804124,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3828906882591092,
"calib/std_conf": 0.0012696423407242182,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9113660245183888,
"calib/step_q_c_n": 1142.0,
"calib/step_q_gap": 0.005036726697565608,
"calib/step_q_w": 0.9063292978208232,
"calib/step_q_w_n": 826.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2945.0,
"completions/max_terminated_length": 2945.0,
"completions/mean_length": 844.6796875,
"completions/mean_terminated_length": 861.5059814453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 475.0,
"epoch": 0.010666666666666666,
"grad_norm": 0.007133356295526028,
"kl": 0.0006768107414245605,
"learning_rate": 2.5e-06,
"loss": 0.0149,
"num_tokens": 3128420.0,
"reward": 0.7908077239990234,
"reward_std": 0.26634955406188965,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/asymmetric_l2_reward": 0.678225576877594,
"rewards/final_brier_reward_step": 0.5932334661483765,
"rewards/format_reward_step": 0.96484375,
"step": 10
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.49809211407149556,
"calib/avg_num_step_conf": 7.453125,
"calib/ece": 0.3765338645418326,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -3.815771857029748e-05,
"calib/mean_conf": 0.9900796812749003,
"calib/mu_c": 0.9900649350649349,
"calib/mu_w": 0.9901030927835052,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3765338645418326,
"calib/std_conf": 0.0008890802232837218,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9106165540540541,
"calib/step_q_c_n": 1184.0,
"calib/step_q_gap": 0.00452539383305961,
"calib/step_q_w": 0.9060911602209945,
"calib/step_q_w_n": 724.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2988.0,
"completions/max_terminated_length": 2988.0,
"completions/mean_length": 862.33203125,
"completions/mean_terminated_length": 869.1220703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 424.0,
"epoch": 0.011733333333333333,
"grad_norm": 0.006749256979674101,
"kl": 0.0007430911064147949,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.0249,
"num_tokens": 3453657.0,
"reward": 0.817877471446991,
"reward_std": 0.18740509450435638,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/asymmetric_l2_reward": 0.7112424373626709,
"rewards/final_brier_reward_step": 0.6088874340057373,
"rewards/format_reward_step": 0.9765625,
"step": 11
},
{
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.4921582605596151,
"calib/avg_num_step_conf": 8.31640625,
"calib/ece": 0.24534979423868308,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00015683478880823554,
"calib/mean_conf": 0.9902057613168724,
"calib/mu_c": 0.9901657458563534,
"calib/mu_w": 0.9903225806451617,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.24534979423868308,
"calib/std_conf": 0.001419603976186038,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9127608554763449,
"calib/step_q_c_n": 1543.0,
"calib/step_q_gap": 0.012481845237436895,
"calib/step_q_w": 0.900279010238908,
"calib/step_q_w_n": 586.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2540.0,
"completions/max_terminated_length": 2540.0,
"completions/mean_length": 789.0234375,
"completions/mean_terminated_length": 811.2047729492188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 329.0,
"epoch": 0.0128,
"grad_norm": 0.007351956330239773,
"kl": 0.0010945796966552734,
"learning_rate": 3e-06,
"loss": -0.0461,
"num_tokens": 3759823.0,
"reward": 0.9017115831375122,
"reward_std": 0.22140005230903625,
"rewards/accuracy_reward_step": 0.70703125,
"rewards/asymmetric_l2_reward": 0.7605472803115845,
"rewards/final_brier_reward_step": 0.7116257548332214,
"rewards/format_reward_step": 0.94921875,
"step": 12
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.49408326204733394,
"calib/avg_num_step_conf": 8.19921875,
"calib/ece": 0.324820717131474,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00011833475905331792,
"calib/mean_conf": 0.9901593625498006,
"calib/mu_c": 0.9901197604790417,
"calib/mu_w": 0.990238095238095,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.324820717131474,
"calib/std_conf": 0.0012522895335061138,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9110180995475113,
"calib/step_q_c_n": 1326.0,
"calib/step_q_gap": 0.0008887334414311443,
"calib/step_q_w": 0.9101293661060802,
"calib/step_q_w_n": 773.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1971.0,
"completions/max_terminated_length": 1971.0,
"completions/mean_length": 825.59375,
"completions/mean_terminated_length": 838.698486328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 333.0,
"epoch": 0.013866666666666666,
"grad_norm": 0.008176865056157112,
"kl": 0.010606169700622559,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.0137,
"num_tokens": 4075767.0,
"reward": 0.8516587018966675,
"reward_std": 0.20331385731697083,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/asymmetric_l2_reward": 0.7181015014648438,
"rewards/final_brier_reward_step": 0.6586534976959229,
"rewards/format_reward_step": 0.98046875,
"step": 13
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5079258841764125,
"calib/avg_num_step_conf": 7.65234375,
"calib/ece": 0.3738775510204082,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00015851768352848072,
"calib/mean_conf": 0.990204081632653,
"calib/mu_c": 0.9902649006622515,
"calib/mu_w": 0.990106382978723,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3738775510204082,
"calib/std_conf": 0.00141391902658684,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9120733788395907,
"calib/step_q_c_n": 1172.0,
"calib/step_q_gap": -0.00030273297743588223,
"calib/step_q_w": 0.9123761118170266,
"calib/step_q_w_n": 787.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2835.0,
"completions/max_terminated_length": 2835.0,
"completions/mean_length": 861.015625,
"completions/mean_terminated_length": 888.790283203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 460.0,
"epoch": 0.014933333333333333,
"grad_norm": 0.006741903256624937,
"kl": 0.0025125741958618164,
"learning_rate": 3.5e-06,
"loss": -0.0444,
"num_tokens": 4401587.0,
"reward": 0.800338864326477,
"reward_std": 0.23323848843574524,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/asymmetric_l2_reward": 0.6942870616912842,
"rewards/final_brier_reward_step": 0.5970156192779541,
"rewards/format_reward_step": 0.95703125,
"step": 14
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.48835025733759907,
"calib/avg_num_step_conf": 8.3828125,
"calib/ece": 0.3558634538152611,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0002329948532481252,
"calib/mean_conf": 0.9904016064257029,
"calib/mu_c": 0.9903164556962023,
"calib/mu_w": 0.9905494505494504,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3558634538152611,
"calib/std_conf": 0.001963358483788004,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9112529002320185,
"calib/step_q_c_n": 1293.0,
"calib/step_q_gap": -0.004925294375249911,
"calib/step_q_w": 0.9161781946072685,
"calib/step_q_w_n": 853.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2379.0,
"completions/max_terminated_length": 2379.0,
"completions/mean_length": 774.71484375,
"completions/mean_terminated_length": 796.4939575195312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 431.0,
"epoch": 0.016,
"grad_norm": 0.006958819925785065,
"kl": 0.003179311752319336,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.0364,
"num_tokens": 4707794.0,
"reward": 0.8193666934967041,
"reward_std": 0.1843802034854889,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/asymmetric_l2_reward": 0.6969516277313232,
"rewards/final_brier_reward_step": 0.6238129138946533,
"rewards/format_reward_step": 0.97265625,
"step": 15
},
{
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.5129361324659965,
"calib/avg_num_step_conf": 8.22265625,
"calib/ece": 0.35505394190871375,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.991701244813278,
"calib/gap": 0.012177927261975197,
"calib/mean_conf": 0.9857593360995851,
"calib/mu_c": 0.9902565789473684,
"calib/mu_w": 0.9780786516853932,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.35505394190871375,
"calib/std_conf": 0.06391215578941556,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.912226787181594,
"calib/step_q_c_n": 1217.0,
"calib/step_q_gap": 0.011711021415828138,
"calib/step_q_w": 0.9005157657657659,
"calib/step_q_w_n": 888.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 3068.0,
"completions/max_terminated_length": 3068.0,
"completions/mean_length": 934.87890625,
"completions/mean_terminated_length": 968.943359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 511.0,
"epoch": 0.017066666666666667,
"grad_norm": 0.005922108888626099,
"kl": 0.003998517990112305,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0454,
"num_tokens": 5055971.0,
"reward": 0.7862738370895386,
"reward_std": 0.22128784656524658,
"rewards/accuracy_reward_step": 0.59375,
"rewards/asymmetric_l2_reward": 0.6605606079101562,
"rewards/final_brier_reward_step": 0.6049558520317078,
"rewards/format_reward_step": 0.94140625,
"step": 16
},
{
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.4801462904911181,
"calib/avg_num_step_conf": 8.56640625,
"calib/ece": 0.2651666666666668,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0003970741901777597,
"calib/mean_conf": 0.9901666666666668,
"calib/mu_c": 0.9900574712643677,
"calib/mu_w": 0.9904545454545455,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2651666666666668,
"calib/std_conf": 0.001280190957978102,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9115190735694824,
"calib/step_q_c_n": 1468.0,
"calib/step_q_gap": 0.009091487362585804,
"calib/step_q_w": 0.9024275862068966,
"calib/step_q_w_n": 725.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3072.0,
"completions/mean_length": 881.22265625,
"completions/mean_terminated_length": 917.044677734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 442.0,
"epoch": 0.018133333333333335,
"grad_norm": 0.006506393197923899,
"kl": 0.006697654724121094,
"learning_rate": 4.25e-06,
"loss": -0.0019,
"num_tokens": 5385092.0,
"reward": 0.8756923675537109,
"reward_std": 0.2749893069267273,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/asymmetric_l2_reward": 0.743429958820343,
"rewards/final_brier_reward_step": 0.6845171451568604,
"rewards/format_reward_step": 0.9375,
"step": 17
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.49460734748904617,
"calib/avg_num_step_conf": 8.52734375,
"calib/ece": 0.4615122950819672,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00011358274351180864,
"calib/mean_conf": 0.9902008196721311,
"calib/mu_c": 0.9901472868217055,
"calib/mu_w": 0.9902608695652173,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4615122950819672,
"calib/std_conf": 0.0013895981198515555,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9050502164502165,
"calib/step_q_c_n": 1155.0,
"calib/step_q_gap": -0.00048480300503650486,
"calib/step_q_w": 0.905535019455253,
"calib/step_q_w_n": 1028.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2863.0,
"completions/max_terminated_length": 2863.0,
"completions/mean_length": 878.26953125,
"completions/mean_terminated_length": 917.7020263671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 496.0,
"epoch": 0.0192,
"grad_norm": 0.006810254883021116,
"kl": 0.009805679321289062,
"learning_rate": 4.5e-06,
"loss": -0.0829,
"num_tokens": 5720649.0,
"reward": 0.7003229856491089,
"reward_std": 0.2627100646495819,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/asymmetric_l2_reward": 0.5975357294082642,
"rewards/final_brier_reward_step": 0.512485146522522,
"rewards/format_reward_step": 0.94921875,
"step": 18
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5020604395604397,
"calib/avg_num_step_conf": 8.515625,
"calib/ece": 0.4044422310756972,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 8.797749869160487e-05,
"calib/mean_conf": 0.9900996015936255,
"calib/mu_c": 0.9901360544217686,
"calib/mu_w": 0.990048076923077,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4044422310756972,
"calib/std_conf": 0.0009415380317208354,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9119737919737919,
"calib/step_q_c_n": 1221.0,
"calib/step_q_gap": 0.00832937070163342,
"calib/step_q_w": 0.9036444212721585,
"calib/step_q_w_n": 959.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2840.0,
"completions/max_terminated_length": 2840.0,
"completions/mean_length": 889.046875,
"completions/mean_terminated_length": 906.7570190429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 488.0,
"epoch": 0.020266666666666665,
"grad_norm": 0.00690700626000762,
"kl": 0.0130615234375,
"learning_rate": 4.75e-06,
"loss": -0.029,
"num_tokens": 6053005.0,
"reward": 0.7996513843536377,
"reward_std": 0.1983795017004013,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/asymmetric_l2_reward": 0.7061575651168823,
"rewards/final_brier_reward_step": 0.5822077393531799,
"rewards/format_reward_step": 0.98046875,
"step": 19
},
{
"calib/answer_extract_rate": 0.9296875,
"calib/auroc": 0.5141586360266864,
"calib/avg_num_step_conf": 8.92578125,
"calib/ece": 0.3908691983122362,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.9957805907172996,
"calib/gap": 0.0011315048183840881,
"calib/mean_conf": 0.9900253164556961,
"calib/mu_c": 0.9904788732394364,
"calib/mu_w": 0.9893473684210523,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3908691983122362,
"calib/std_conf": 0.006170519301011835,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9144237160120847,
"calib/step_q_c_n": 1324.0,
"calib/step_q_gap": 0.014782717052667427,
"calib/step_q_w": 0.8996409989594173,
"calib/step_q_w_n": 961.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2904.0,
"completions/max_terminated_length": 2904.0,
"completions/mean_length": 878.88671875,
"completions/mean_terminated_length": 922.110595703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 476.0,
"epoch": 0.021333333333333333,
"grad_norm": 0.006423440296202898,
"kl": 0.017368316650390625,
"learning_rate": 5e-06,
"loss": -0.0475,
"num_tokens": 6382872.0,
"reward": 0.7497608661651611,
"reward_std": 0.25458869338035583,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/asymmetric_l2_reward": 0.6401803493499756,
"rewards/final_brier_reward_step": 0.5624663829803467,
"rewards/format_reward_step": 0.92578125,
"step": 20
},
{
"calib/answer_extract_rate": 0.92578125,
"calib/auroc": 0.5082817337461301,
"calib/avg_num_step_conf": 9.05859375,
"calib/ece": 0.34897046413502086,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0001698142414863213,
"calib/mean_conf": 0.990320675105485,
"calib/mu_c": 0.9903815789473684,
"calib/mu_w": 0.9902117647058821,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.34897046413502086,
"calib/std_conf": 0.0017181442888759794,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9113709063214015,
"calib/step_q_c_n": 1313.0,
"calib/step_q_gap": -0.004332871014582684,
"calib/step_q_w": 0.9157037773359842,
"calib/step_q_w_n": 1006.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 2425.0,
"completions/max_terminated_length": 2425.0,
"completions/mean_length": 862.90625,
"completions/mean_terminated_length": 928.1680908203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 486.0,
"epoch": 0.0224,
"grad_norm": 0.006447064224630594,
"kl": 0.019985198974609375,
"learning_rate": 4.9722222222222224e-06,
"loss": -0.1212,
"num_tokens": 6706736.0,
"reward": 0.7912242412567139,
"reward_std": 0.337002694606781,
"rewards/accuracy_reward_step": 0.59375,
"rewards/asymmetric_l2_reward": 0.6783818006515503,
"rewards/final_brier_reward_step": 0.600160539150238,
"rewards/format_reward_step": 0.92578125,
"step": 21
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.4864224137931035,
"calib/avg_num_step_conf": 9.453125,
"calib/ece": 0.34248987854251,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.000250287356321377,
"calib/mean_conf": 0.9902631578947367,
"calib/mu_c": 0.990175,
"calib/mu_w": 0.9904252873563214,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.34248987854251,
"calib/std_conf": 0.0015427704414423698,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9139300554016622,
"calib/step_q_c_n": 1444.0,
"calib/step_q_gap": -0.008908059352436215,
"calib/step_q_w": 0.9228381147540984,
"calib/step_q_w_n": 976.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2716.0,
"completions/max_terminated_length": 2716.0,
"completions/mean_length": 926.14453125,
"completions/mean_terminated_length": 952.1806640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 554.0,
"epoch": 0.023466666666666667,
"grad_norm": 0.006909825373440981,
"kl": 0.02556610107421875,
"learning_rate": 4.944444444444445e-06,
"loss": -0.0571,
"num_tokens": 7045645.0,
"reward": 0.8283920884132385,
"reward_std": 0.2065667062997818,
"rewards/accuracy_reward_step": 0.625,
"rewards/asymmetric_l2_reward": 0.7074013948440552,
"rewards/final_brier_reward_step": 0.631413996219635,
"rewards/format_reward_step": 0.96484375,
"step": 22
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.48486980999296275,
"calib/avg_num_step_conf": 9.1875,
"calib/ece": 0.3936090534979424,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0002897959183671972,
"calib/mean_conf": 0.9903168724279835,
"calib/mu_c": 0.9902000000000001,
"calib/mu_w": 0.9904897959183673,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3936090534979424,
"calib/std_conf": 0.0017196525908183033,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9122642244738894,
"calib/step_q_c_n": 1283.0,
"calib/step_q_gap": -0.00529237047466069,
"calib/step_q_w": 0.9175565949485501,
"calib/step_q_w_n": 1069.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 3040.0,
"completions/max_terminated_length": 3040.0,
"completions/mean_length": 991.2578125,
"completions/mean_terminated_length": 1023.2338256835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 576.0,
"epoch": 0.024533333333333334,
"grad_norm": 0.006668829824775457,
"kl": 0.026376724243164062,
"learning_rate": 4.9166666666666665e-06,
"loss": -0.0238,
"num_tokens": 7403343.0,
"reward": 0.7600979208946228,
"reward_std": 0.28747719526290894,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/asymmetric_l2_reward": 0.6435527801513672,
"rewards/final_brier_reward_step": 0.5735179781913757,
"rewards/format_reward_step": 0.9453125,
"step": 23
},
{
"calib/answer_extract_rate": 0.875,
"calib/auroc": 0.5044955044955045,
"calib/avg_num_step_conf": 9.6875,
"calib/ece": 0.3985466367713004,
"calib/final_conf_rate": 0.87109375,
"calib/format_rate": 0.8671875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 8.010323010343434e-05,
"calib/mean_conf": 0.9904748878923767,
"calib/mu_c": 0.9905075757575759,
"calib/mu_w": 0.9904274725274724,
"calib/nonempty_final_conf_rate": 0.87109375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3985466367713004,
"calib/std_conf": 0.0020874592149258808,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9131759180187873,
"calib/step_q_c_n": 1171.0,
"calib/step_q_gap": 0.003358500142545906,
"calib/step_q_w": 0.9098174178762414,
"calib/step_q_w_n": 1309.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0859375,
"completions/max_length": 3019.0,
"completions/max_terminated_length": 3019.0,
"completions/mean_length": 991.3203125,
"completions/mean_terminated_length": 1084.5213623046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 456.0,
"epoch": 0.0256,
"grad_norm": 0.006332829128950834,
"kl": 0.027338027954101562,
"learning_rate": 4.888888888888889e-06,
"loss": -0.0698,
"num_tokens": 7761633.0,
"reward": 0.7012392282485962,
"reward_std": 0.3621658682823181,
"rewards/accuracy_reward_step": 0.515625,
"rewards/asymmetric_l2_reward": 0.6074740886688232,
"rewards/final_brier_reward_step": 0.5184417963027954,
"rewards/format_reward_step": 0.8671875,
"step": 24
},
{
"calib/answer_extract_rate": 0.921875,
"calib/auroc": 0.5003185981680606,
"calib/avg_num_step_conf": 9.76171875,
"calib/ece": 0.3333389830508474,
"calib/final_conf_rate": 0.921875,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 1.1469534050179142e-05,
"calib/mean_conf": 0.9901186440677966,
"calib/mu_c": 0.9901225806451612,
"calib/mu_w": 0.990111111111111,
"calib/nonempty_final_conf_rate": 0.921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3333389830508474,
"calib/std_conf": 0.0010469446387952413,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9102945791726105,
"calib/step_q_c_n": 1402.0,
"calib/step_q_gap": -0.00886950469247616,
"calib/step_q_w": 0.9191640838650866,
"calib/step_q_w_n": 1097.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 3056.0,
"completions/max_terminated_length": 3056.0,
"completions/mean_length": 930.88671875,
"completions/mean_terminated_length": 992.9458618164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 627.0,
"epoch": 0.02666666666666667,
"grad_norm": 0.00617326470091939,
"kl": 0.03643798828125,
"learning_rate": 4.861111111111111e-06,
"loss": -0.0607,
"num_tokens": 8103164.0,
"reward": 0.7968409061431885,
"reward_std": 0.26151537895202637,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/asymmetric_l2_reward": 0.6765774488449097,
"rewards/final_brier_reward_step": 0.6116355061531067,
"rewards/format_reward_step": 0.921875,
"step": 25
},
{
"calib/answer_extract_rate": 0.921875,
"calib/auroc": 0.49639751552795025,
"calib/avg_num_step_conf": 9.203125,
"calib/ece": 0.30787711864406775,
"calib/final_conf_rate": 0.921875,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -7.743271221494208e-05,
"calib/mean_conf": 0.9900805084745762,
"calib/mu_c": 0.990055900621118,
"calib/mu_w": 0.990133333333333,
"calib/nonempty_final_conf_rate": 0.921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.30787711864406775,
"calib/std_conf": 0.0008720478989503826,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.910840395480226,
"calib/step_q_c_n": 1416.0,
"calib/step_q_gap": 0.001616991224906772,
"calib/step_q_w": 0.9092234042553192,
"calib/step_q_w_n": 940.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2894.0,
"completions/max_terminated_length": 2894.0,
"completions/mean_length": 961.359375,
"completions/mean_terminated_length": 1025.4500732421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 586.0,
"epoch": 0.027733333333333332,
"grad_norm": 0.006155260372906923,
"kl": 0.03830718994140625,
"learning_rate": 4.833333333333333e-06,
"loss": -0.0363,
"num_tokens": 8454512.0,
"reward": 0.8146965503692627,
"reward_std": 0.18639928102493286,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/asymmetric_l2_reward": 0.684640645980835,
"rewards/final_brier_reward_step": 0.6345961093902588,
"rewards/format_reward_step": 0.921875,
"step": 26
},
{
"calib/answer_extract_rate": 0.92578125,
"calib/auroc": 0.48558532323820613,
"calib/avg_num_step_conf": 9.10546875,
"calib/ece": 0.4161898734177215,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00041438555620243456,
"calib/mean_conf": 0.9900295358649789,
"calib/mu_c": 0.9898529411764706,
"calib/mu_w": 0.990267326732673,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4161898734177215,
"calib/std_conf": 0.0024673336271401705,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9105679012345679,
"calib/step_q_c_n": 1215.0,
"calib/step_q_gap": 0.005031162883313289,
"calib/step_q_w": 0.9055367383512546,
"calib/step_q_w_n": 1116.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 2504.0,
"completions/max_terminated_length": 2504.0,
"completions/mean_length": 957.65234375,
"completions/mean_terminated_length": 1025.7698974609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 581.0,
"epoch": 0.0288,
"grad_norm": 0.006110661197453737,
"kl": 0.03726959228515625,
"learning_rate": 4.805555555555556e-06,
"loss": -0.0576,
"num_tokens": 8804887.0,
"reward": 0.7280486822128296,
"reward_std": 0.2960751950740814,
"rewards/accuracy_reward_step": 0.53125,
"rewards/asymmetric_l2_reward": 0.6258590817451477,
"rewards/final_brier_reward_step": 0.5388320088386536,
"rewards/format_reward_step": 0.92578125,
"step": 27
},
{
"calib/answer_extract_rate": 0.890625,
"calib/auroc": 0.5021139705882354,
"calib/avg_num_step_conf": 9.41796875,
"calib/ece": 0.28837280701754386,
"calib/final_conf_rate": 0.890625,
"calib/format_rate": 0.890625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 7.647058823556652e-05,
"calib/mean_conf": 0.9901271929824561,
"calib/mu_c": 0.9901500000000001,
"calib/mu_w": 0.9900735294117645,
"calib/nonempty_final_conf_rate": 0.890625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.28837280701754386,
"calib/std_conf": 0.0009984887780795314,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9067275985663082,
"calib/step_q_c_n": 1395.0,
"calib/step_q_gap": -0.008336377811644535,
"calib/step_q_w": 0.9150639763779528,
"calib/step_q_w_n": 1016.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08984375,
"completions/max_length": 2781.0,
"completions/max_terminated_length": 2781.0,
"completions/mean_length": 942.27734375,
"completions/mean_terminated_length": 1035.2918701171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 611.0,
"epoch": 0.029866666666666666,
"grad_norm": 0.005902933422476053,
"kl": 0.036403656005859375,
"learning_rate": 4.777777777777778e-06,
"loss": -0.1197,
"num_tokens": 9153054.0,
"reward": 0.8076508045196533,
"reward_std": 0.25794512033462524,
"rewards/accuracy_reward_step": 0.625,
"rewards/asymmetric_l2_reward": 0.681990921497345,
"rewards/final_brier_reward_step": 0.6301857233047485,
"rewards/format_reward_step": 0.890625,
"step": 28
},
{
"calib/answer_extract_rate": 0.9140625,
"calib/auroc": 0.4990469208211144,
"calib/avg_num_step_conf": 8.796875,
"calib/ece": 0.46025213675213683,
"calib/final_conf_rate": 0.9140625,
"calib/format_rate": 0.9140625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -1.1436950146603841e-05,
"calib/mean_conf": 0.9901666666666668,
"calib/mu_c": 0.9901612903225805,
"calib/mu_w": 0.9901727272727271,
"calib/nonempty_final_conf_rate": 0.9140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.46025213675213683,
"calib/std_conf": 0.0012650799778778229,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9085794920037629,
"calib/step_q_c_n": 1063.0,
"calib/step_q_gap": -0.006252299417599749,
"calib/step_q_w": 0.9148317914213626,
"calib/step_q_w_n": 1189.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 2987.0,
"completions/max_terminated_length": 2987.0,
"completions/mean_length": 1021.19921875,
"completions/mean_terminated_length": 1098.432861328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 611.0,
"epoch": 0.030933333333333334,
"grad_norm": 0.005722293630242348,
"kl": 0.037353515625,
"learning_rate": 4.75e-06,
"loss": -0.0728,
"num_tokens": 9521609.0,
"reward": 0.6883062124252319,
"reward_std": 0.24335990846157074,
"rewards/accuracy_reward_step": 0.484375,
"rewards/asymmetric_l2_reward": 0.6041944026947021,
"rewards/final_brier_reward_step": 0.4927304685115814,
"rewards/format_reward_step": 0.9140625,
"step": 29
},
{
"calib/answer_extract_rate": 0.921875,
"calib/auroc": 0.5036021984551395,
"calib/avg_num_step_conf": 9.14453125,
"calib/ece": 0.41129361702127665,
"calib/final_conf_rate": 0.91796875,
"calib/format_rate": 0.9140625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 4.686571598344624e-05,
"calib/mean_conf": 0.9900170212765957,
"calib/mu_c": 0.9900367647058824,
"calib/mu_w": 0.989989898989899,
"calib/nonempty_final_conf_rate": 0.91796875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.41129361702127665,
"calib/std_conf": 0.0009361121838881834,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9080051369863015,
"calib/step_q_c_n": 1168.0,
"calib/step_q_gap": 0.004070780635065274,
"calib/step_q_w": 0.9039343563512362,
"calib/step_q_w_n": 1173.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 2745.0,
"completions/max_terminated_length": 2745.0,
"completions/mean_length": 981.59765625,
"completions/mean_terminated_length": 1060.2911376953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 636.0,
"epoch": 0.032,
"grad_norm": 0.005487215239554644,
"kl": 0.039276123046875,
"learning_rate": 4.722222222222222e-06,
"loss": -0.1298,
"num_tokens": 9879882.0,
"reward": 0.7259608507156372,
"reward_std": 0.3174560070037842,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/asymmetric_l2_reward": 0.6232558488845825,
"rewards/final_brier_reward_step": 0.5388221740722656,
"rewards/format_reward_step": 0.9140625,
"step": 30
},
{
"calib/answer_extract_rate": 0.875,
"calib/auroc": 0.4976919339164237,
"calib/avg_num_step_conf": 8.4296875,
"calib/ece": 0.4276160714285714,
"calib/final_conf_rate": 0.875,
"calib/format_rate": 0.875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0001383219954648185,
"calib/mean_conf": 0.9901160714285714,
"calib/mu_c": 0.9900555555555555,
"calib/mu_w": 0.9901938775510203,
"calib/nonempty_final_conf_rate": 0.875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4276160714285714,
"calib/std_conf": 0.001914444789499249,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.906287184284378,
"calib/step_q_c_n": 1069.0,
"calib/step_q_gap": 0.0062476985176195265,
"calib/step_q_w": 0.9000394857667585,
"calib/step_q_w_n": 1089.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 2950.0,
"completions/max_terminated_length": 2950.0,
"completions/mean_length": 1086.8125,
"completions/mean_terminated_length": 1173.94091796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 640.0,
"epoch": 0.03306666666666667,
"grad_norm": 0.0050511229783296585,
"kl": 0.032291412353515625,
"learning_rate": 4.694444444444445e-06,
"loss": 0.0117,
"num_tokens": 10264018.0,
"reward": 0.6780734658241272,
"reward_std": 0.25499579310417175,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/asymmetric_l2_reward": 0.5831027626991272,
"rewards/final_brier_reward_step": 0.4996066093444824,
"rewards/format_reward_step": 0.875,
"step": 31
},
{
"calib/answer_extract_rate": 0.90625,
"calib/auroc": 0.4933823529411765,
"calib/avg_num_step_conf": 8.265625,
"calib/ece": 0.4012640692640692,
"calib/final_conf_rate": 0.90234375,
"calib/format_rate": 0.90234375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00013986068111493388,
"calib/mean_conf": 0.990008658008658,
"calib/mu_c": 0.9900661764705883,
"calib/mu_w": 0.9899263157894733,
"calib/nonempty_final_conf_rate": 0.90234375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4012640692640692,
"calib/std_conf": 0.0024335165832523125,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9063417611159548,
"calib/step_q_c_n": 1147.0,
"calib/step_q_gap": 0.0021353627671416575,
"calib/step_q_w": 0.9042063983488131,
"calib/step_q_w_n": 969.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 3048.0,
"completions/max_terminated_length": 3048.0,
"completions/mean_length": 991.8203125,
"completions/mean_terminated_length": 1053.5518798828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 593.0,
"epoch": 0.034133333333333335,
"grad_norm": 0.005195594392716885,
"kl": 0.04204559326171875,
"learning_rate": 4.666666666666667e-06,
"loss": -0.051,
"num_tokens": 10624628.0,
"reward": 0.714741587638855,
"reward_std": 0.2588036060333252,
"rewards/accuracy_reward_step": 0.53125,
"rewards/asymmetric_l2_reward": 0.6041333675384521,
"rewards/final_brier_reward_step": 0.5386311411857605,
"rewards/format_reward_step": 0.90234375,
"step": 32
},
{
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.4734042553191489,
"calib/avg_num_step_conf": 8.2734375,
"calib/ece": 0.3802282157676349,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0004787234042552546,
"calib/mean_conf": 0.9901867219917013,
"calib/mu_c": 0.9899999999999998,
"calib/mu_w": 0.990478723404255,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3802282157676349,
"calib/std_conf": 0.0012828222102560924,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.906288998357964,
"calib/step_q_c_n": 1218.0,
"calib/step_q_gap": 0.002572331691297225,
"calib/step_q_w": 0.9037166666666667,
"calib/step_q_w_n": 900.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2737.0,
"completions/max_terminated_length": 2737.0,
"completions/mean_length": 908.328125,
"completions/mean_terminated_length": 956.9217529296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 527.0,
"epoch": 0.0352,
"grad_norm": 0.005166413728147745,
"kl": 0.0454559326171875,
"learning_rate": 4.638888888888889e-06,
"loss": -0.0887,
"num_tokens": 10964032.0,
"reward": 0.7751585245132446,
"reward_std": 0.2357899397611618,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/asymmetric_l2_reward": 0.6716193556785583,
"rewards/final_brier_reward_step": 0.5771350860595703,
"rewards/format_reward_step": 0.93359375,
"step": 33
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5082296266559614,
"calib/avg_num_step_conf": 8.234375,
"calib/ece": 0.41906437246963557,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0003461394353003744,
"calib/mean_conf": 0.9899145748987853,
"calib/mu_c": 0.9900631205673758,
"calib/mu_w": 0.9897169811320754,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.41906437246963557,
"calib/std_conf": 0.0021831221656469823,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9054787322768975,
"calib/step_q_c_n": 1199.0,
"calib/step_q_gap": 0.0013093153352032871,
"calib/step_q_w": 0.9041694169416942,
"calib/step_q_w_n": 909.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2385.0,
"completions/max_terminated_length": 2385.0,
"completions/mean_length": 924.50390625,
"completions/mean_terminated_length": 942.9203491210938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 604.0,
"epoch": 0.03626666666666667,
"grad_norm": 0.006008944008499384,
"kl": 0.046756744384765625,
"learning_rate": 4.611111111111112e-06,
"loss": -0.01,
"num_tokens": 11305817.0,
"reward": 0.7865276336669922,
"reward_std": 0.3068256378173828,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/asymmetric_l2_reward": 0.7107362151145935,
"rewards/final_brier_reward_step": 0.5591940879821777,
"rewards/format_reward_step": 0.96484375,
"step": 34
},
{
"calib/answer_extract_rate": 0.9140625,
"calib/auroc": 0.5233333333333333,
"calib/avg_num_step_conf": 7.85546875,
"calib/ece": 0.4155617021276594,
"calib/final_conf_rate": 0.91796875,
"calib/format_rate": 0.90625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0007481481481481644,
"calib/mean_conf": 0.9900297872340424,
"calib/mu_c": 0.990348148148148,
"calib/mu_w": 0.9895999999999998,
"calib/nonempty_final_conf_rate": 0.91796875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.4155617021276594,
"calib/std_conf": 0.002948343559011518,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9087988980716254,
"calib/step_q_c_n": 1089.0,
"calib/step_q_gap": 0.001499548830844466,
"calib/step_q_w": 0.9072993492407809,
"calib/step_q_w_n": 922.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2616.0,
"completions/max_terminated_length": 2616.0,
"completions/mean_length": 962.83984375,
"completions/mean_terminated_length": 1027.0291748046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 577.0,
"epoch": 0.037333333333333336,
"grad_norm": 0.004935343284159899,
"kl": 0.04593658447265625,
"learning_rate": 4.583333333333333e-06,
"loss": -0.0778,
"num_tokens": 11661560.0,
"reward": 0.7069214582443237,
"reward_std": 0.28172507882118225,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/asymmetric_l2_reward": 0.5961191654205322,
"rewards/final_brier_reward_step": 0.5310050249099731,
"rewards/format_reward_step": 0.90625,
"step": 35
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.47736757624398074,
"calib/avg_num_step_conf": 8.796875,
"calib/ece": 0.2724637096774194,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00043001605136394616,
"calib/mean_conf": 0.9902056451612904,
"calib/mu_c": 0.9900842696629213,
"calib/mu_w": 0.9905142857142852,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2724637096774194,
"calib/std_conf": 0.0016045702077219465,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9096727501573316,
"calib/step_q_c_n": 1589.0,
"calib/step_q_gap": -0.002705832044779921,
"calib/step_q_w": 0.9123785822021115,
"calib/step_q_w_n": 663.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 3058.0,
"completions/max_terminated_length": 3058.0,
"completions/mean_length": 958.34375,
"completions/mean_terminated_length": 977.4342651367188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 551.0,
"epoch": 0.0384,
"grad_norm": 0.041168417781591415,
"kl": 0.22402572631835938,
"learning_rate": 4.555555555555556e-06,
"loss": -0.0138,
"num_tokens": 12009608.0,
"reward": 0.9012855291366577,
"reward_std": 0.21603024005889893,
"rewards/accuracy_reward_step": 0.6953125,
"rewards/asymmetric_l2_reward": 0.769353985786438,
"rewards/final_brier_reward_step": 0.7004045248031616,
"rewards/format_reward_step": 0.96875,
"step": 36
},
{
"calib/answer_extract_rate": 0.921875,
"calib/auroc": 0.5127486892192774,
"calib/avg_num_step_conf": 7.6796875,
"calib/ece": 0.4942669491525423,
"calib/final_conf_rate": 0.921875,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0004327371974431893,
"calib/mean_conf": 0.9900296610169491,
"calib/mu_c": 0.9902478632478632,
"calib/mu_w": 0.98981512605042,
"calib/nonempty_final_conf_rate": 0.921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4942669491525423,
"calib/std_conf": 0.0024725547524240125,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9118057482656096,
"calib/step_q_c_n": 1009.0,
"calib/step_q_gap": 0.009423303124543647,
"calib/step_q_w": 0.9023824451410659,
"calib/step_q_w_n": 957.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 2911.0,
"completions/max_terminated_length": 2911.0,
"completions/mean_length": 1007.19140625,
"completions/mean_terminated_length": 1065.4586181640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 613.0,
"epoch": 0.039466666666666664,
"grad_norm": 0.0046908557415008545,
"kl": 0.04862213134765625,
"learning_rate": 4.527777777777778e-06,
"loss": -0.0334,
"num_tokens": 12374545.0,
"reward": 0.6690409183502197,
"reward_std": 0.21596181392669678,
"rewards/accuracy_reward_step": 0.45703125,
"rewards/asymmetric_l2_reward": 0.595897912979126,
"rewards/final_brier_reward_step": 0.46640270948410034,
"rewards/format_reward_step": 0.921875,
"step": 37
},
{
"calib/answer_extract_rate": 0.91796875,
"calib/auroc": 0.4941360722610722,
"calib/avg_num_step_conf": 7.88671875,
"calib/ece": 0.431148305084746,
"calib/final_conf_rate": 0.921875,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00010460372960396924,
"calib/mean_conf": 0.990470338983051,
"calib/mu_c": 0.9904242424242423,
"calib/mu_w": 0.9905288461538463,
"calib/nonempty_final_conf_rate": 0.921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.431148305084746,
"calib/std_conf": 0.002415339724921821,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9140769911504424,
"calib/step_q_c_n": 1130.0,
"calib/step_q_gap": 0.007677666066077893,
"calib/step_q_w": 0.9063993250843645,
"calib/step_q_w_n": 889.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 2742.0,
"completions/max_terminated_length": 2742.0,
"completions/mean_length": 929.5625,
"completions/mean_terminated_length": 995.6820068359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 570.0,
"epoch": 0.04053333333333333,
"grad_norm": 0.005542919039726257,
"kl": 0.06835556030273438,
"learning_rate": 4.5e-06,
"loss": -0.0711,
"num_tokens": 12719401.0,
"reward": 0.7114863395690918,
"reward_std": 0.25867563486099243,
"rewards/accuracy_reward_step": 0.515625,
"rewards/asymmetric_l2_reward": 0.613100528717041,
"rewards/final_brier_reward_step": 0.5231534838676453,
"rewards/format_reward_step": 0.91796875,
"step": 38
},
{
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.4365874009178139,
"calib/avg_num_step_conf": 8.25,
"calib/ece": 0.41026666666666656,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0007673341677096879,
"calib/mean_conf": 0.9905135802469135,
"calib/mu_c": 0.9901914893617019,
"calib/mu_w": 0.9909588235294116,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.41026666666666656,
"calib/std_conf": 0.0040809328722435515,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9138510808646918,
"calib/step_q_c_n": 1249.0,
"calib/step_q_gap": 0.0021768050825363705,
"calib/step_q_w": 0.9116742757821554,
"calib/step_q_w_n": 863.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2830.0,
"completions/max_terminated_length": 2830.0,
"completions/mean_length": 998.80078125,
"completions/mean_terminated_length": 1031.0201416015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 520.0,
"epoch": 0.0416,
"grad_norm": 0.005000025033950806,
"kl": 0.056304931640625,
"learning_rate": 4.472222222222223e-06,
"loss": -0.0509,
"num_tokens": 13081182.0,
"reward": 0.7604011297225952,
"reward_std": 0.26943159103393555,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/asymmetric_l2_reward": 0.6646355390548706,
"rewards/final_brier_reward_step": 0.5577292442321777,
"rewards/format_reward_step": 0.94140625,
"step": 39
},
{
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.5077826419289834,
"calib/avg_num_step_conf": 8.2578125,
"calib/ece": 0.4777125000000001,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0001811548884721459,
"calib/mean_conf": 0.9902125,
"calib/mu_c": 0.9903008130081301,
"calib/mu_w": 0.990119658119658,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4777125000000001,
"calib/std_conf": 0.001868781354252018,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9149052319842053,
"calib/step_q_c_n": 1013.0,
"calib/step_q_gap": 0.004578256507365985,
"calib/step_q_w": 0.9103269754768393,
"calib/step_q_w_n": 1101.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2863.0,
"completions/max_terminated_length": 2863.0,
"completions/mean_length": 975.87890625,
"completions/mean_terminated_length": 1015.5487670898438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 541.0,
"epoch": 0.042666666666666665,
"grad_norm": 0.004450581502169371,
"kl": 0.0665130615234375,
"learning_rate": 4.444444444444444e-06,
"loss": -0.0089,
"num_tokens": 13437767.0,
"reward": 0.6698815822601318,
"reward_std": 0.24097877740859985,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/asymmetric_l2_reward": 0.5667624473571777,
"rewards/final_brier_reward_step": 0.48940688371658325,
"rewards/format_reward_step": 0.9375,
"step": 40
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.515946783305996,
"calib/avg_num_step_conf": 8.76171875,
"calib/ece": 0.23104081632653062,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00031802442135930864,
"calib/mean_conf": 0.9902244897959184,
"calib/mu_c": 0.9903010752688172,
"calib/mu_w": 0.9899830508474579,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.23104081632653062,
"calib/std_conf": 0.0016814146876772244,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9165045262522632,
"calib/step_q_c_n": 1657.0,
"calib/step_q_gap": 0.00827756379492528,
"calib/step_q_w": 0.9082269624573379,
"calib/step_q_w_n": 586.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2973.0,
"completions/max_terminated_length": 2973.0,
"completions/mean_length": 935.36328125,
"completions/mean_terminated_length": 953.9960327148438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 535.0,
"epoch": 0.04373333333333333,
"grad_norm": 0.09011942893266678,
"kl": 0.38977813720703125,
"learning_rate": 4.416666666666667e-06,
"loss": -0.0148,
"num_tokens": 13784468.0,
"reward": 0.9118338823318481,
"reward_std": 0.25103092193603516,
"rewards/accuracy_reward_step": 0.7265625,
"rewards/asymmetric_l2_reward": 0.7605504989624023,
"rewards/final_brier_reward_step": 0.7271796464920044,
"rewards/format_reward_step": 0.953125,
"step": 41
},
{
"calib/answer_extract_rate": 0.93359375,
"calib/auroc": 0.5044742729306487,
"calib/avg_num_step_conf": 9.09375,
"calib/ece": 0.36661087467081876,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.99581589958159,
"calib/gap": 0.0012255885233013464,
"calib/mean_conf": 0.990041837013915,
"calib/mu_c": 0.9905033557046978,
"calib/mu_w": 0.9892777671813965,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.36661087467081876,
"calib/std_conf": 0.0065371684647353365,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9178921282798833,
"calib/step_q_c_n": 1372.0,
"calib/step_q_gap": 0.002306356216440375,
"calib/step_q_w": 0.9155857720634429,
"calib/step_q_w_n": 956.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2878.0,
"completions/max_terminated_length": 2878.0,
"completions/mean_length": 883.82421875,
"completions/mean_terminated_length": 931.1069946289062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 531.0,
"epoch": 0.0448,
"grad_norm": 0.004536244552582502,
"kl": 0.08171844482421875,
"learning_rate": 4.388888888888889e-06,
"loss": -0.0463,
"num_tokens": 14115095.0,
"reward": 0.7774260640144348,
"reward_std": 0.20005618035793304,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/asymmetric_l2_reward": 0.6669763922691345,
"rewards/final_brier_reward_step": 0.585532009601593,
"rewards/format_reward_step": 0.9296875,
"step": 42
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5237985567326885,
"calib/avg_num_step_conf": 9.2734375,
"calib/ece": 0.30875510204081646,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00041839398126841676,
"calib/mean_conf": 0.9903877551020409,
"calib/mu_c": 0.9905209580838322,
"calib/mu_w": 0.9901025641025638,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.30875510204081646,
"calib/std_conf": 0.001809457315539576,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9154681262073407,
"calib/step_q_c_n": 1553.0,
"calib/step_q_gap": 0.0038030835764029103,
"calib/step_q_w": 0.9116650426309378,
"calib/step_q_w_n": 821.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 3019.0,
"completions/max_terminated_length": 3019.0,
"completions/mean_length": 960.828125,
"completions/mean_terminated_length": 987.8392944335938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 591.0,
"epoch": 0.04586666666666667,
"grad_norm": 0.0043022241443395615,
"kl": 0.07738494873046875,
"learning_rate": 4.361111111111112e-06,
"loss": 0.0064,
"num_tokens": 14466291.0,
"reward": 0.8499053120613098,
"reward_std": 0.2604464888572693,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/asymmetric_l2_reward": 0.7243393063545227,
"rewards/final_brier_reward_step": 0.6543775796890259,
"rewards/format_reward_step": 0.953125,
"step": 43
},
{
"calib/answer_extract_rate": 0.92578125,
"calib/auroc": 0.5004680973642518,
"calib/avg_num_step_conf": 9.03515625,
"calib/ece": 0.43760759493670887,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 5.329108454577014e-05,
"calib/mean_conf": 0.9903502109704642,
"calib/mu_c": 0.9903740458015268,
"calib/mu_w": 0.990320754716981,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.43760759493670887,
"calib/std_conf": 0.0017883029334125153,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.91791894127378,
"calib/step_q_c_n": 1209.0,
"calib/step_q_gap": 0.007094665911461129,
"calib/step_q_w": 0.9108242753623189,
"calib/step_q_w_n": 1104.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2998.0,
"completions/max_terminated_length": 2998.0,
"completions/mean_length": 933.0859375,
"completions/mean_terminated_length": 978.975341796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 582.0,
"epoch": 0.046933333333333334,
"grad_norm": 0.004361076280474663,
"kl": 0.08103179931640625,
"learning_rate": 4.333333333333334e-06,
"loss": -0.0364,
"num_tokens": 14811481.0,
"reward": 0.7002480030059814,
"reward_std": 0.22973665595054626,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/asymmetric_l2_reward": 0.598037838935852,
"rewards/final_brier_reward_step": 0.5157393217086792,
"rewards/format_reward_step": 0.921875,
"step": 44
},
{
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.5265525565064736,
"calib/avg_num_step_conf": 10.54296875,
"calib/ece": 0.3753968055555554,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.9958333333333333,
"calib/gap": 0.0076378245922025245,
"calib/mean_conf": 0.9878968055555556,
"calib/mu_c": 0.9908564625850338,
"calib/mu_w": 0.9832186379928313,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.3753968055555554,
"calib/std_conf": 0.042494877076636975,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.918011901504788,
"calib/step_q_c_n": 1462.0,
"calib/step_q_gap": 0.007727342086841005,
"calib/step_q_w": 0.910284559417947,
"calib/step_q_w_n": 1237.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2732.0,
"completions/max_terminated_length": 2732.0,
"completions/mean_length": 941.203125,
"completions/mean_terminated_length": 987.4917602539062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 534.0,
"epoch": 0.048,
"grad_norm": 0.004294428043067455,
"kl": 0.08637237548828125,
"learning_rate": 4.305555555555556e-06,
"loss": -0.0266,
"num_tokens": 15157477.0,
"reward": 0.7546592354774475,
"reward_std": 0.2604624629020691,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/asymmetric_l2_reward": 0.6334857940673828,
"rewards/final_brier_reward_step": 0.5766139626502991,
"rewards/format_reward_step": 0.921875,
"step": 45
},
{
"calib/answer_extract_rate": 0.9140625,
"calib/auroc": 0.48762157382847043,
"calib/avg_num_step_conf": 10.15625,
"calib/ece": 0.49259656652360534,
"calib/final_conf_rate": 0.91015625,
"calib/format_rate": 0.90625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00024506336575280674,
"calib/mean_conf": 0.9904506437768242,
"calib/mu_c": 0.9903275862068965,
"calib/mu_w": 0.9905726495726493,
"calib/nonempty_final_conf_rate": 0.91015625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.49259656652360534,
"calib/std_conf": 0.0022290343214388304,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9128881685575364,
"calib/step_q_c_n": 1234.0,
"calib/step_q_gap": -0.0026799134336789043,
"calib/step_q_w": 0.9155680819912153,
"calib/step_q_w_n": 1366.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 2859.0,
"completions/max_terminated_length": 2859.0,
"completions/mean_length": 952.4609375,
"completions/mean_terminated_length": 1011.7427978515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 524.0,
"epoch": 0.04906666666666667,
"grad_norm": 0.003909003920853138,
"kl": 0.08541107177734375,
"learning_rate": 4.277777777777778e-06,
"loss": -0.0688,
"num_tokens": 15506075.0,
"reward": 0.6511839628219604,
"reward_std": 0.23548802733421326,
"rewards/accuracy_reward_step": 0.453125,
"rewards/asymmetric_l2_reward": 0.5727441310882568,
"rewards/final_brier_reward_step": 0.45774880051612854,
"rewards/format_reward_step": 0.90625,
"step": 46
},
{
"calib/answer_extract_rate": 0.921875,
"calib/auroc": 0.47734067663257274,
"calib/avg_num_step_conf": 9.25,
"calib/ece": 0.3367299578059072,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.9140625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00041864673485447756,
"calib/mean_conf": 0.9907383966244726,
"calib/mu_c": 0.9905935483870968,
"calib/mu_w": 0.9910121951219513,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3367299578059072,
"calib/std_conf": 0.0028222896641156934,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9156944444444445,
"calib/step_q_c_n": 1512.0,
"calib/step_q_gap": 0.007248182762201516,
"calib/step_q_w": 0.908446261682243,
"calib/step_q_w_n": 856.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 3053.0,
"completions/max_terminated_length": 3053.0,
"completions/mean_length": 952.6484375,
"completions/mean_terminated_length": 1003.6131591796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 608.0,
"epoch": 0.050133333333333335,
"grad_norm": 0.004438928794115782,
"kl": 0.09577178955078125,
"learning_rate": 4.25e-06,
"loss": -0.0408,
"num_tokens": 15855929.0,
"reward": 0.7888574600219727,
"reward_std": 0.22867697477340698,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/asymmetric_l2_reward": 0.6705582737922668,
"rewards/final_brier_reward_step": 0.6032503247261047,
"rewards/format_reward_step": 0.9140625,
"step": 47
},
{
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.4854422903203391,
"calib/avg_num_step_conf": 10.80859375,
"calib/ece": 0.4783245833333335,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00014048363560548172,
"calib/mean_conf": 0.9908245833333335,
"calib/mu_c": 0.9907560975609756,
"calib/mu_w": 0.9908965811965811,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4783245833333335,
"calib/std_conf": 0.003715297743975429,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9193959881129272,
"calib/step_q_c_n": 1346.0,
"calib/step_q_gap": 0.011938352644946915,
"calib/step_q_w": 0.9074576354679803,
"calib/step_q_w_n": 1421.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2835.0,
"completions/max_terminated_length": 2835.0,
"completions/mean_length": 943.671875,
"completions/mean_terminated_length": 986.040771484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 606.0,
"epoch": 0.0512,
"grad_norm": 0.004192838445305824,
"kl": 0.10528564453125,
"learning_rate": 4.222222222222223e-06,
"loss": -0.0329,
"num_tokens": 16201197.0,
"reward": 0.6844377517700195,
"reward_std": 0.32408520579338074,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/asymmetric_l2_reward": 0.601270854473114,
"rewards/final_brier_reward_step": 0.4847921133041382,
"rewards/format_reward_step": 0.93359375,
"step": 48
},
{
"calib/answer_extract_rate": 0.9140625,
"calib/auroc": 0.4895036678632746,
"calib/avg_num_step_conf": 11.13671875,
"calib/ece": 0.35335035460992914,
"calib/final_conf_rate": 0.91796875,
"calib/format_rate": 0.9140625,
"calib/frac_conf_gt_0.9": 0.9872340425531915,
"calib/gap": 0.0024480255970035225,
"calib/mean_conf": 0.9845560283687944,
"calib/mu_c": 0.9854519015659954,
"calib/mu_w": 0.9830038759689919,
"calib/nonempty_final_conf_rate": 0.91796875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3519319148936171,
"calib/std_conf": 0.06066803876025451,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9165114773396116,
"calib/step_q_c_n": 1699.0,
"calib/step_q_gap": 0.01334047039516717,
"calib/step_q_w": 0.9031710069444444,
"calib/step_q_w_n": 1152.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2891.0,
"completions/max_terminated_length": 2891.0,
"completions/mean_length": 890.3984375,
"completions/mean_terminated_length": 949.7583618164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 527.0,
"epoch": 0.05226666666666667,
"grad_norm": 0.0043127527460455894,
"kl": 0.100250244140625,
"learning_rate": 4.194444444444445e-06,
"loss": -0.0883,
"num_tokens": 16533675.0,
"reward": 0.7699627876281738,
"reward_std": 0.2641654908657074,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/asymmetric_l2_reward": 0.6509315967559814,
"rewards/final_brier_reward_step": 0.5897751450538635,
"rewards/format_reward_step": 0.9140625,
"step": 49
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5242763772175536,
"calib/avg_num_step_conf": 10.9375,
"calib/ece": 0.3634918032786886,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0007050204697266471,
"calib/mean_conf": 0.9905409836065574,
"calib/mu_c": 0.9908039215686276,
"calib/mu_w": 0.990098901098901,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3634918032786886,
"calib/std_conf": 0.003249583935495379,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9180515151515152,
"calib/step_q_c_n": 1650.0,
"calib/step_q_gap": 0.004113254281949974,
"calib/step_q_w": 0.9139382608695652,
"calib/step_q_w_n": 1150.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2566.0,
"completions/max_terminated_length": 2566.0,
"completions/mean_length": 905.09765625,
"completions/mean_terminated_length": 926.820068359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 86.0,
"epoch": 0.05333333333333334,
"grad_norm": 0.003878743853420019,
"kl": 0.1117095947265625,
"learning_rate": 4.166666666666667e-06,
"loss": -0.0217,
"num_tokens": 16870740.0,
"reward": 0.7831500172615051,
"reward_std": 0.28436478972435,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/asymmetric_l2_reward": 0.6562309265136719,
"rewards/final_brier_reward_step": 0.6006940603256226,
"rewards/format_reward_step": 0.94921875,
"step": 50
},
{
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.500757460990759,
"calib/avg_num_step_conf": 10.390625,
"calib/ece": 0.3277983539094652,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00015308286623205714,
"calib/mean_conf": 0.9903497942386833,
"calib/mu_c": 0.9902981366459629,
"calib/mu_w": 0.990451219512195,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3277983539094652,
"calib/std_conf": 0.00372692764480295,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9127832009080591,
"calib/step_q_c_n": 1762.0,
"calib/step_q_gap": -0.0003715875106490296,
"calib/step_q_w": 0.9131547884187081,
"calib/step_q_w_n": 898.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2950.0,
"completions/max_terminated_length": 2950.0,
"completions/mean_length": 875.6015625,
"completions/mean_terminated_length": 911.195068359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 550.0,
"epoch": 0.0544,
"grad_norm": 0.004427890758961439,
"kl": 0.106109619140625,
"learning_rate": 4.138888888888889e-06,
"loss": -0.0402,
"num_tokens": 17204190.0,
"reward": 0.8117591142654419,
"reward_std": 0.2127530574798584,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/asymmetric_l2_reward": 0.6729713678359985,
"rewards/final_brier_reward_step": 0.6349219083786011,
"rewards/format_reward_step": 0.94921875,
"step": 51
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.48323471400394485,
"calib/avg_num_step_conf": 10.9140625,
"calib/ece": 0.3018319672131149,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.9959016393442623,
"calib/gap": -0.006114635108481092,
"calib/mean_conf": 0.9862581967213114,
"calib/mu_c": 0.9843786982248521,
"calib/mu_w": 0.9904933333333332,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2977336065573772,
"calib/std_conf": 0.06331118609908344,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9123175487465182,
"calib/step_q_c_n": 1795.0,
"calib/step_q_gap": -3.883797136672307e-06,
"calib/step_q_w": 0.9123214325436548,
"calib/step_q_w_n": 999.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2795.0,
"completions/max_terminated_length": 2795.0,
"completions/mean_length": 898.99609375,
"completions/mean_terminated_length": 927.9959106445312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 545.0,
"epoch": 0.055466666666666664,
"grad_norm": 0.004448126070201397,
"kl": 0.111572265625,
"learning_rate": 4.111111111111111e-06,
"loss": 0.0072,
"num_tokens": 17542285.0,
"reward": 0.8596779108047485,
"reward_std": 0.25896918773651123,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/asymmetric_l2_reward": 0.7349734306335449,
"rewards/final_brier_reward_step": 0.661726176738739,
"rewards/format_reward_step": 0.953125,
"step": 52
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5007157464212679,
"calib/avg_num_step_conf": 9.921875,
"calib/ece": 0.34688142292490115,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9960474308300395,
"calib/gap": -0.0008140422631220945,
"calib/mean_conf": 0.9899644268774704,
"calib/mu_c": 0.9896748466257669,
"calib/mu_w": 0.990488888888889,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3462885375494071,
"calib/std_conf": 0.009443127003087047,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9152066167290887,
"calib/step_q_c_n": 1602.0,
"calib/step_q_gap": 0.002208748925250692,
"calib/step_q_w": 0.912997867803838,
"calib/step_q_w_n": 938.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1759.0,
"completions/max_terminated_length": 1759.0,
"completions/mean_length": 919.34375,
"completions/mean_terminated_length": 930.2451171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 493.0,
"epoch": 0.05653333333333333,
"grad_norm": 0.004339267965406179,
"kl": 0.11077880859375,
"learning_rate": 4.083333333333334e-06,
"loss": -0.0096,
"num_tokens": 17883461.0,
"reward": 0.8446915149688721,
"reward_std": 0.24097609519958496,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/asymmetric_l2_reward": 0.7220233082771301,
"rewards/final_brier_reward_step": 0.6431408524513245,
"rewards/format_reward_step": 0.984375,
"step": 53
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5237379332172811,
"calib/avg_num_step_conf": 11.91796875,
"calib/ece": 0.27575502008032127,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00048520335496160527,
"calib/mean_conf": 0.9906144578313253,
"calib/mu_c": 0.9907528089887639,
"calib/mu_w": 0.9902676056338023,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.27575502008032127,
"calib/std_conf": 0.0026684140498420606,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9163947105788423,
"calib/step_q_c_n": 2004.0,
"calib/step_q_gap": 0.006814958907400026,
"calib/step_q_w": 0.9095797516714422,
"calib/step_q_w_n": 1047.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 1784.0,
"completions/max_terminated_length": 1784.0,
"completions/mean_length": 842.21875,
"completions/mean_terminated_length": 862.4320678710938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 537.0,
"epoch": 0.0576,
"grad_norm": 0.004163231234997511,
"kl": 0.1133880615234375,
"learning_rate": 4.055555555555556e-06,
"loss": -0.0272,
"num_tokens": 18205301.0,
"reward": 0.8736650943756104,
"reward_std": 0.26025390625,
"rewards/accuracy_reward_step": 0.6953125,
"rewards/asymmetric_l2_reward": 0.7224922180175781,
"rewards/final_brier_reward_step": 0.692806601524353,
"rewards/format_reward_step": 0.96484375,
"step": 54
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.49713242961418136,
"calib/avg_num_step_conf": 10.42578125,
"calib/ece": 0.44043333333333345,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00014805135557871552,
"calib/mean_conf": 0.9906341365461848,
"calib/mu_c": 0.9907007299270073,
"calib/mu_w": 0.9905526785714286,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.44043333333333345,
"calib/std_conf": 0.0032175970974869994,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9174707412223667,
"calib/step_q_c_n": 1538.0,
"calib/step_q_gap": 0.005732191266575315,
"calib/step_q_w": 0.9117385499557914,
"calib/step_q_w_n": 1131.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2272.0,
"completions/max_terminated_length": 2272.0,
"completions/mean_length": 862.86328125,
"completions/mean_terminated_length": 883.572021484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 498.0,
"epoch": 0.058666666666666666,
"grad_norm": 0.004341635387390852,
"kl": 0.1074371337890625,
"learning_rate": 4.027777777777779e-06,
"loss": -0.0263,
"num_tokens": 18534018.0,
"reward": 0.7443338632583618,
"reward_std": 0.19267994165420532,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/asymmetric_l2_reward": 0.6437778472900391,
"rewards/final_brier_reward_step": 0.5433272123336792,
"rewards/format_reward_step": 0.97265625,
"step": 55
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5393295718552937,
"calib/avg_num_step_conf": 10.25390625,
"calib/ece": 0.4578983739837398,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0009985396614667907,
"calib/mean_conf": 0.9904186991869918,
"calib/mu_c": 0.990885496183206,
"calib/mu_w": 0.9898869565217392,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4578983739837398,
"calib/std_conf": 0.0035635199916164025,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9186488388458831,
"calib/step_q_c_n": 1421.0,
"calib/step_q_gap": 0.005492692666481114,
"calib/step_q_w": 0.913156146179402,
"calib/step_q_w_n": 1204.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2633.0,
"completions/max_terminated_length": 2633.0,
"completions/mean_length": 868.015625,
"completions/mean_terminated_length": 896.01611328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 563.0,
"epoch": 0.05973333333333333,
"grad_norm": 0.004599003586918116,
"kl": 0.11109161376953125,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0241,
"num_tokens": 18863070.0,
"reward": 0.7282382249832153,
"reward_std": 0.27239811420440674,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/asymmetric_l2_reward": 0.6412410140037537,
"rewards/final_brier_reward_step": 0.5207042694091797,
"rewards/format_reward_step": 0.9609375,
"step": 56
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.48916912822843295,
"calib/avg_num_step_conf": 10.65234375,
"calib/ece": 0.3227213114754097,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.9959016393442623,
"calib/gap": -0.0008485950162842038,
"calib/mean_conf": 0.9907540983606556,
"calib/mu_c": 0.9904723926380369,
"calib/mu_w": 0.9913209876543211,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3227213114754097,
"calib/std_conf": 0.00696809059083824,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9192886836027714,
"calib/step_q_c_n": 1732.0,
"calib/step_q_gap": 0.012469588125384345,
"calib/step_q_w": 0.906819095477387,
"calib/step_q_w_n": 995.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 3027.0,
"completions/max_terminated_length": 3027.0,
"completions/mean_length": 854.36328125,
"completions/mean_terminated_length": 881.92333984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 564.0,
"epoch": 0.0608,
"grad_norm": 0.0046896119602024555,
"kl": 0.10809326171875,
"learning_rate": 3.972222222222223e-06,
"loss": 0.0136,
"num_tokens": 19188579.0,
"reward": 0.8231363296508789,
"reward_std": 0.21880272030830383,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/asymmetric_l2_reward": 0.6887975931167603,
"rewards/final_brier_reward_step": 0.641849935054779,
"rewards/format_reward_step": 0.94140625,
"step": 57
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5048640724946695,
"calib/avg_num_step_conf": 10.06640625,
"calib/ece": 0.4460243902439024,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -5.143923240968906e-05,
"calib/mean_conf": 0.9907398373983739,
"calib/mu_c": 0.9907164179104475,
"calib/mu_w": 0.9907678571428572,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4460243902439024,
"calib/std_conf": 0.0034999126548311047,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9133302611367129,
"calib/step_q_c_n": 1302.0,
"calib/step_q_gap": 0.000403986626908992,
"calib/step_q_w": 0.9129262745098039,
"calib/step_q_w_n": 1275.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2097.0,
"completions/max_terminated_length": 2097.0,
"completions/mean_length": 907.40625,
"completions/mean_terminated_length": 936.6773681640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 497.0,
"epoch": 0.06186666666666667,
"grad_norm": 0.004239367786794901,
"kl": 0.0998992919921875,
"learning_rate": 3.944444444444445e-06,
"loss": -0.0459,
"num_tokens": 19527195.0,
"reward": 0.7133985757827759,
"reward_std": 0.27472320199012756,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/asymmetric_l2_reward": 0.6040467619895935,
"rewards/final_brier_reward_step": 0.5274378061294556,
"rewards/format_reward_step": 0.953125,
"step": 58
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.48307274841710807,
"calib/avg_num_step_conf": 9.89453125,
"calib/ece": 0.4240278884462152,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.9920318725099602,
"calib/gap": 0.0011575138906835258,
"calib/mean_conf": 0.9897649402390439,
"calib/mu_c": 0.9902676056338026,
"calib/mu_w": 0.989110091743119,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4240278884462152,
"calib/std_conf": 0.009348313326382543,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9131354883081156,
"calib/step_q_c_n": 1454.0,
"calib/step_q_gap": -0.00026858954174535654,
"calib/step_q_w": 0.913404077849861,
"calib/step_q_w_n": 1079.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2588.0,
"completions/max_terminated_length": 2588.0,
"completions/mean_length": 857.40234375,
"completions/mean_terminated_length": 871.011962890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 490.0,
"epoch": 0.06293333333333333,
"grad_norm": 0.004450817126780748,
"kl": 0.1063232421875,
"learning_rate": 3.916666666666667e-06,
"loss": -0.0098,
"num_tokens": 19852938.0,
"reward": 0.7629809379577637,
"reward_std": 0.28226304054260254,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/asymmetric_l2_reward": 0.6598451733589172,
"rewards/final_brier_reward_step": 0.5598666667938232,
"rewards/format_reward_step": 0.9765625,
"step": 59
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.4952478384265065,
"calib/avg_num_step_conf": 9.59765625,
"calib/ece": 0.43016895161290347,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00022657910368950596,
"calib/mean_conf": 0.9906528225806454,
"calib/mu_c": 0.9905532374100718,
"calib/mu_w": 0.9907798165137613,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.43016895161290347,
"calib/std_conf": 0.0036146643668052265,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9191200987306065,
"calib/step_q_c_n": 1418.0,
"calib/step_q_gap": 0.007535883138691246,
"calib/step_q_w": 0.9115842155919153,
"calib/step_q_w_n": 1039.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2479.0,
"completions/max_terminated_length": 2479.0,
"completions/mean_length": 830.61328125,
"completions/mean_terminated_length": 853.9638061523438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 525.0,
"epoch": 0.064,
"grad_norm": 0.004765588324517012,
"kl": 0.1033172607421875,
"learning_rate": 3.88888888888889e-06,
"loss": -0.0186,
"num_tokens": 20174431.0,
"reward": 0.7508342862129211,
"reward_std": 0.2844465672969818,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/asymmetric_l2_reward": 0.648601770401001,
"rewards/final_brier_reward_step": 0.5507229566574097,
"rewards/format_reward_step": 0.96875,
"step": 60
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5230346576500422,
"calib/avg_num_step_conf": 10.578125,
"calib/ece": 0.3233754940711463,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0004493519301215221,
"calib/mean_conf": 0.9913596837944665,
"calib/mu_c": 0.9915088757396451,
"calib/mu_w": 0.9910595238095236,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3233754940711463,
"calib/std_conf": 0.0035107225784080016,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9175910396323952,
"calib/step_q_c_n": 1741.0,
"calib/step_q_gap": 0.007756499818537987,
"calib/step_q_w": 0.9098345398138572,
"calib/step_q_w_n": 967.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1689.0,
"completions/max_terminated_length": 1689.0,
"completions/mean_length": 803.07421875,
"completions/mean_terminated_length": 809.3976440429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 460.0,
"epoch": 0.06506666666666666,
"grad_norm": 0.004396671429276466,
"kl": 0.1050567626953125,
"learning_rate": 3.861111111111112e-06,
"loss": 0.0029,
"num_tokens": 20484082.0,
"reward": 0.8529362082481384,
"reward_std": 0.2431645393371582,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/asymmetric_l2_reward": 0.7111063003540039,
"rewards/final_brier_reward_step": 0.6658596992492676,
"rewards/format_reward_step": 0.984375,
"step": 61
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.45560005085176714,
"calib/avg_num_step_conf": 10.02734375,
"calib/ece": 0.4439281746031747,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0008107932875668356,
"calib/mean_conf": 0.9915472222222224,
"calib/mu_c": 0.9911804347826085,
"calib/mu_w": 0.9919912280701754,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4439281746031747,
"calib/std_conf": 0.0035269641918985748,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9174202823179792,
"calib/step_q_c_n": 1346.0,
"calib/step_q_gap": 2.7161924858742914e-05,
"calib/step_q_w": 0.9173931203931205,
"calib/step_q_w_n": 1221.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2473.0,
"completions/max_terminated_length": 2473.0,
"completions/mean_length": 835.9921875,
"completions/mean_terminated_length": 845.9051513671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 485.0,
"epoch": 0.06613333333333334,
"grad_norm": 0.004839341156184673,
"kl": 0.10821533203125,
"learning_rate": 3.833333333333334e-06,
"loss": 0.0001,
"num_tokens": 20805176.0,
"reward": 0.7392706871032715,
"reward_std": 0.31165140867233276,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/asymmetric_l2_reward": 0.6277410984039307,
"rewards/final_brier_reward_step": 0.5461127161979675,
"rewards/format_reward_step": 0.984375,
"step": 62
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5094806802632437,
"calib/avg_num_step_conf": 8.9765625,
"calib/ece": 0.399904761904762,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00022962142438298638,
"calib/mean_conf": 0.9911746031746033,
"calib/mu_c": 0.9912684563758388,
"calib/mu_w": 0.9910388349514558,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.399904761904762,
"calib/std_conf": 0.003223375350785793,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9177375000000001,
"calib/step_q_c_n": 1440.0,
"calib/step_q_gap": 0.010147756410256559,
"calib/step_q_w": 0.9075897435897435,
"calib/step_q_w_n": 858.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2837.0,
"completions/max_terminated_length": 2837.0,
"completions/mean_length": 866.95703125,
"completions/mean_terminated_length": 873.783447265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 442.0,
"epoch": 0.0672,
"grad_norm": 0.004709714557975531,
"kl": 0.096405029296875,
"learning_rate": 3.8055555555555556e-06,
"loss": 0.0354,
"num_tokens": 21135757.0,
"reward": 0.784890353679657,
"reward_std": 0.2856467366218567,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/asymmetric_l2_reward": 0.6720311641693115,
"rewards/final_brier_reward_step": 0.5852494239807129,
"rewards/format_reward_step": 0.98046875,
"step": 63
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5216008771929825,
"calib/avg_num_step_conf": 9.19140625,
"calib/ece": 0.309864143426295,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00047931286549685037,
"calib/mean_conf": 0.9911390438247013,
"calib/mu_c": 0.9912918128654971,
"calib/mu_w": 0.9908125000000002,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.309864143426295,
"calib/std_conf": 0.0030717562948793975,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9159911111111112,
"calib/step_q_c_n": 1575.0,
"calib/step_q_gap": 0.00026231933733222856,
"calib/step_q_w": 0.915728791773779,
"calib/step_q_w_n": 778.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2331.0,
"completions/max_terminated_length": 2331.0,
"completions/mean_length": 829.42578125,
"completions/mean_terminated_length": 845.9482421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 493.0,
"epoch": 0.06826666666666667,
"grad_norm": 0.004558868706226349,
"kl": 0.10372161865234375,
"learning_rate": 3.777777777777778e-06,
"loss": -0.0184,
"num_tokens": 21451866.0,
"reward": 0.8665769100189209,
"reward_std": 0.2548443377017975,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/asymmetric_l2_reward": 0.7345287799835205,
"rewards/final_brier_reward_step": 0.669718861579895,
"rewards/format_reward_step": 0.9765625,
"step": 64
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5004037267080745,
"calib/avg_num_step_conf": 9.6015625,
"calib/ece": 0.44124313725490194,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9921568627450981,
"calib/gap": 0.0013813664596273867,
"calib/mean_conf": 0.9902627450980392,
"calib/mu_c": 0.9908857142857143,
"calib/mu_w": 0.9895043478260869,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.44124313725490194,
"calib/std_conf": 0.009060882543352867,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9190794357832219,
"calib/step_q_c_n": 1347.0,
"calib/step_q_gap": 0.002451172956939307,
"calib/step_q_w": 0.9166282628262826,
"calib/step_q_w_n": 1111.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1578.0,
"completions/max_terminated_length": 1578.0,
"completions/mean_length": 767.11328125,
"completions/mean_terminated_length": 770.1216430664062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 393.0,
"epoch": 0.06933333333333333,
"grad_norm": 0.004670554306358099,
"kl": 0.105682373046875,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.0159,
"num_tokens": 21753271.0,
"reward": 0.745876669883728,
"reward_std": 0.22282391786575317,
"rewards/accuracy_reward_step": 0.546875,
"rewards/asymmetric_l2_reward": 0.6341006755828857,
"rewards/final_brier_reward_step": 0.5514026880264282,
"rewards/format_reward_step": 0.984375,
"step": 65
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.48895463510848125,
"calib/avg_num_step_conf": 9.0859375,
"calib/ece": 0.5179024291497976,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0002717948717949614,
"calib/mean_conf": 0.9915866396761134,
"calib/mu_c": 0.9914435897435897,
"calib/mu_w": 0.9917153846153847,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.5179024291497976,
"calib/std_conf": 0.004608654906179873,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.917523611111111,
"calib/step_q_c_n": 1080.0,
"calib/step_q_gap": 0.008120721865525149,
"calib/step_q_w": 0.9094028892455859,
"calib/step_q_w_n": 1246.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 3037.0,
"completions/max_terminated_length": 3037.0,
"completions/mean_length": 859.48046875,
"completions/mean_terminated_length": 887.2056274414062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 485.0,
"epoch": 0.0704,
"grad_norm": 0.004522152245044708,
"kl": 0.10164642333984375,
"learning_rate": 3.7222222222222225e-06,
"loss": -0.0549,
"num_tokens": 22079650.0,
"reward": 0.6742883920669556,
"reward_std": 0.24997103214263916,
"rewards/accuracy_reward_step": 0.45703125,
"rewards/asymmetric_l2_reward": 0.5988451838493347,
"rewards/final_brier_reward_step": 0.4653565287590027,
"rewards/format_reward_step": 0.96484375,
"step": 66
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.4814592503668134,
"calib/avg_num_step_conf": 8.80859375,
"calib/ece": 0.38200398406374503,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00022695744964651432,
"calib/mean_conf": 0.9915657370517929,
"calib/mu_c": 0.9914771241830065,
"calib/mu_w": 0.991704081632653,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.38200398406374503,
"calib/std_conf": 0.004837463914705453,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9192009943181819,
"calib/step_q_c_n": 1408.0,
"calib/step_q_gap": 0.0034536507526565785,
"calib/step_q_w": 0.9157473435655253,
"calib/step_q_w_n": 847.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2144.0,
"completions/max_terminated_length": 2144.0,
"completions/mean_length": 826.7578125,
"completions/mean_terminated_length": 836.561279296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 496.0,
"epoch": 0.07146666666666666,
"grad_norm": 0.0044913021847605705,
"kl": 0.099365234375,
"learning_rate": 3.694444444444445e-06,
"loss": 0.0045,
"num_tokens": 22396308.0,
"reward": 0.7945924401283264,
"reward_std": 0.22359302639961243,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/asymmetric_l2_reward": 0.6705037355422974,
"rewards/final_brier_reward_step": 0.6038373708724976,
"rewards/format_reward_step": 0.9765625,
"step": 67
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.49789445979734176,
"calib/avg_num_step_conf": 8.57421875,
"calib/ece": 0.39864063745019934,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9960159362549801,
"calib/gap": -0.0008291354125539163,
"calib/mean_conf": 0.9906725099601594,
"calib/mu_c": 0.9903355704697987,
"calib/mu_w": 0.9911647058823526,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.39784382470119534,
"calib/std_conf": 0.012954025404413391,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9185092524056256,
"calib/step_q_c_n": 1351.0,
"calib/step_q_gap": 0.00946245145775837,
"calib/step_q_w": 0.9090468009478673,
"calib/step_q_w_n": 844.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2807.0,
"completions/max_terminated_length": 2807.0,
"completions/mean_length": 836.859375,
"completions/mean_terminated_length": 846.7826538085938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 416.0,
"epoch": 0.07253333333333334,
"grad_norm": 0.004729957785457373,
"kl": 0.0984649658203125,
"learning_rate": 3.6666666666666666e-06,
"loss": -0.0043,
"num_tokens": 22714632.0,
"reward": 0.788429319858551,
"reward_std": 0.2118925005197525,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/asymmetric_l2_reward": 0.6775600910186768,
"rewards/final_brier_reward_step": 0.5883611440658569,
"rewards/format_reward_step": 0.97265625,
"step": 68
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.47797979797979806,
"calib/avg_num_step_conf": 8.078125,
"calib/ece": 0.44032202857142866,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0003832882154881645,
"calib/mean_conf": 0.9913424367346939,
"calib/mu_c": 0.991170348148148,
"calib/mu_w": 0.9915536363636361,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.44032202857142866,
"calib/std_conf": 0.0036035280865406505,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9141814665523157,
"calib/step_q_c_n": 1166.0,
"calib/step_q_gap": 0.003857741496883338,
"calib/step_q_w": 0.9103237250554324,
"calib/step_q_w_n": 902.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2661.0,
"completions/max_terminated_length": 2661.0,
"completions/mean_length": 869.47265625,
"completions/mean_terminated_length": 883.2738647460938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 527.0,
"epoch": 0.0736,
"grad_norm": 0.004320142790675163,
"kl": 0.09468841552734375,
"learning_rate": 3.638888888888889e-06,
"loss": -0.0006,
"num_tokens": 23041713.0,
"reward": 0.7292081117630005,
"reward_std": 0.24731634557247162,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/asymmetric_l2_reward": 0.627022922039032,
"rewards/final_brier_reward_step": 0.5345181822776794,
"rewards/format_reward_step": 0.95703125,
"step": 69
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.49813928761297194,
"calib/avg_num_step_conf": 7.99609375,
"calib/ece": 0.4548939024390245,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00015917065390724794,
"calib/mean_conf": 0.9914792682926831,
"calib/mu_c": 0.9915530303030302,
"calib/mu_w": 0.991393859649123,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.4548939024390245,
"calib/std_conf": 0.0041524330551632686,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9180130095403296,
"calib/step_q_c_n": 1153.0,
"calib/step_q_gap": 0.0100985800101282,
"calib/step_q_w": 0.9079144295302014,
"calib/step_q_w_n": 894.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 3024.0,
"completions/max_terminated_length": 3024.0,
"completions/mean_length": 831.33984375,
"completions/mean_terminated_length": 847.9004516601562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 383.0,
"epoch": 0.07466666666666667,
"grad_norm": 0.004979654680937529,
"kl": 0.0870513916015625,
"learning_rate": 3.6111111111111115e-06,
"loss": 0.007,
"num_tokens": 23361528.0,
"reward": 0.7205314636230469,
"reward_std": 0.25079619884490967,
"rewards/accuracy_reward_step": 0.515625,
"rewards/asymmetric_l2_reward": 0.6327803730964661,
"rewards/final_brier_reward_step": 0.5153137445449829,
"rewards/format_reward_step": 0.94921875,
"step": 70
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5466309391543036,
"calib/avg_num_step_conf": 7.71484375,
"calib/ece": 0.4190948000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0008683027253120201,
"calib/mean_conf": 0.9910948,
"calib/mu_c": 0.9914664335664335,
"calib/mu_w": 0.9905981308411215,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4190948000000001,
"calib/std_conf": 0.0031562276470495625,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9149893195521103,
"calib/step_q_c_n": 1161.0,
"calib/step_q_gap": 0.0079745775373683,
"calib/step_q_w": 0.907014742014742,
"calib/step_q_w_n": 814.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2727.0,
"completions/max_terminated_length": 2727.0,
"completions/mean_length": 842.80859375,
"completions/mean_terminated_length": 856.1865844726562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 502.0,
"epoch": 0.07573333333333333,
"grad_norm": 0.004616872873157263,
"kl": 0.08941650390625,
"learning_rate": 3.5833333333333335e-06,
"loss": 0.0087,
"num_tokens": 23681695.0,
"reward": 0.7680221796035767,
"reward_std": 0.28707748651504517,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/asymmetric_l2_reward": 0.6626472473144531,
"rewards/final_brier_reward_step": 0.5663659572601318,
"rewards/format_reward_step": 0.9765625,
"step": 71
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.45953630796150485,
"calib/avg_num_step_conf": 7.97265625,
"calib/ece": 0.4894138339920948,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0007374828146481383,
"calib/mean_conf": 0.991390118577075,
"calib/mu_c": 0.9910228346456692,
"calib/mu_w": 0.9917603174603173,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4894138339920948,
"calib/std_conf": 0.005447240769577733,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9165731593662628,
"calib/step_q_c_n": 1073.0,
"calib/step_q_gap": 0.005960659366262888,
"calib/step_q_w": 0.9106124999999999,
"calib/step_q_w_n": 968.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2172.0,
"completions/max_terminated_length": 2172.0,
"completions/mean_length": 789.16796875,
"completions/mean_terminated_length": 798.5256958007812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 464.0,
"epoch": 0.0768,
"grad_norm": 0.004809282254427671,
"kl": 0.097625732421875,
"learning_rate": 3.555555555555556e-06,
"loss": -0.0322,
"num_tokens": 23988130.0,
"reward": 0.7388637065887451,
"reward_std": 0.25400206446647644,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/asymmetric_l2_reward": 0.6775314807891846,
"rewards/final_brier_reward_step": 0.5041021108627319,
"rewards/format_reward_step": 0.984375,
"step": 72
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.4815950920245399,
"calib/avg_num_step_conf": 7.9140625,
"calib/ece": 0.34234183266932283,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.000287123535973266,
"calib/mean_conf": 0.9917442231075698,
"calib/mu_c": 0.9916435582822086,
"calib/mu_w": 0.9919306818181819,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.34234183266932283,
"calib/std_conf": 0.003711937844814869,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9156873333333334,
"calib/step_q_c_n": 1350.0,
"calib/step_q_gap": 0.004211297830374883,
"calib/step_q_w": 0.9114760355029585,
"calib/step_q_w_n": 676.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2176.0,
"completions/max_terminated_length": 2176.0,
"completions/mean_length": 772.7890625,
"completions/mean_terminated_length": 781.9525756835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 454.0,
"epoch": 0.07786666666666667,
"grad_norm": 0.0048720454797148705,
"kl": 0.08669281005859375,
"learning_rate": 3.5277777777777784e-06,
"loss": 0.0042,
"num_tokens": 24292996.0,
"reward": 0.8364192247390747,
"reward_std": 0.3165234923362732,
"rewards/accuracy_reward_step": 0.640625,
"rewards/asymmetric_l2_reward": 0.7072925567626953,
"rewards/final_brier_reward_step": 0.6421083211898804,
"rewards/format_reward_step": 0.9765625,
"step": 73
},
{
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.48536348501664806,
"calib/avg_num_step_conf": 7.81640625,
"calib/ece": 0.4293219008264464,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.9958677685950413,
"calib/gap": -0.0009002913429522197,
"calib/mean_conf": 0.9913053719008266,
"calib/mu_c": 0.9909110294117648,
"calib/mu_w": 0.991811320754717,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4293219008264464,
"calib/std_conf": 0.006998713877282736,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9165494362532525,
"calib/step_q_c_n": 1153.0,
"calib/step_q_gap": 0.008406275875894043,
"calib/step_q_w": 0.9081431603773584,
"calib/step_q_w_n": 848.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2647.0,
"completions/max_terminated_length": 2647.0,
"completions/mean_length": 807.8671875,
"completions/mean_terminated_length": 833.9273681640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 348.0,
"epoch": 0.07893333333333333,
"grad_norm": 0.0050594923086464405,
"kl": 0.0992584228515625,
"learning_rate": 3.5e-06,
"loss": -0.0046,
"num_tokens": 24603738.0,
"reward": 0.7477249503135681,
"reward_std": 0.256036639213562,
"rewards/accuracy_reward_step": 0.53125,
"rewards/asymmetric_l2_reward": 0.6630828380584717,
"rewards/final_brier_reward_step": 0.5378357172012329,
"rewards/format_reward_step": 0.94140625,
"step": 74
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.4979323308270677,
"calib/avg_num_step_conf": 8.22265625,
"calib/ece": 0.29454183266932255,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -5.233082706768144e-05,
"calib/mean_conf": 0.9917529880478086,
"calib/mu_c": 0.9917371428571428,
"calib/mu_w": 0.9917894736842104,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.29454183266932255,
"calib/std_conf": 0.0038501230171085607,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9169490274983233,
"calib/step_q_c_n": 1491.0,
"calib/step_q_gap": 0.007225900462492829,
"calib/step_q_w": 0.9097231270358305,
"calib/step_q_w_n": 614.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2305.0,
"completions/max_terminated_length": 2305.0,
"completions/mean_length": 767.90625,
"completions/mean_terminated_length": 783.2031860351562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 495.0,
"epoch": 0.08,
"grad_norm": 0.008225214667618275,
"kl": 0.0906524658203125,
"learning_rate": 3.4722222222222224e-06,
"loss": -0.0088,
"num_tokens": 24905074.0,
"reward": 0.8864485025405884,
"reward_std": 0.2618010640144348,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/asymmetric_l2_reward": 0.7516970038414001,
"rewards/final_brier_reward_step": 0.6883875131607056,
"rewards/format_reward_step": 0.98046875,
"step": 75
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5231930213237331,
"calib/avg_num_step_conf": 7.81640625,
"calib/ece": 0.36082730923694784,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00041622819163655844,
"calib/mean_conf": 0.9913493975903614,
"calib/mu_c": 0.9915031847133756,
"calib/mu_w": 0.9910869565217391,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.36082730923694784,
"calib/std_conf": 0.0032884136442572405,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9175392670157068,
"calib/step_q_c_n": 1337.0,
"calib/step_q_gap": 0.011506134485586239,
"calib/step_q_w": 0.9060331325301205,
"calib/step_q_w_n": 664.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2962.0,
"completions/max_terminated_length": 2962.0,
"completions/mean_length": 802.3671875,
"completions/mean_terminated_length": 818.3506469726562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 457.0,
"epoch": 0.08106666666666666,
"grad_norm": 0.004342818632721901,
"kl": 0.082427978515625,
"learning_rate": 3.444444444444445e-06,
"loss": -0.0124,
"num_tokens": 25213536.0,
"reward": 0.8073495626449585,
"reward_std": 0.22171545028686523,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/asymmetric_l2_reward": 0.6834534406661987,
"rewards/final_brier_reward_step": 0.6156206130981445,
"rewards/format_reward_step": 0.96484375,
"step": 76
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.4841960434329912,
"calib/avg_num_step_conf": 7.52734375,
"calib/ece": 0.32991020408163263,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -1.494868362328372e-05,
"calib/mean_conf": 0.991134693877551,
"calib/mu_c": 0.9911296296296296,
"calib/mu_w": 0.9911445783132529,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.32991020408163263,
"calib/std_conf": 0.0038748562064384777,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.9157070552147238,
"calib/step_q_c_n": 1304.0,
"calib/step_q_gap": 0.009309949105399062,
"calib/step_q_w": 0.9063971061093248,
"calib/step_q_w_n": 622.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2973.0,
"completions/max_terminated_length": 2973.0,
"completions/mean_length": 796.47265625,
"completions/mean_terminated_length": 822.165283203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 393.0,
"epoch": 0.08213333333333334,
"grad_norm": 0.004987195134162903,
"kl": 0.0777130126953125,
"learning_rate": 3.416666666666667e-06,
"loss": -0.0196,
"num_tokens": 25522097.0,
"reward": 0.816591739654541,
"reward_std": 0.272071897983551,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/asymmetric_l2_reward": 0.6926058530807495,
"rewards/final_brier_reward_step": 0.6265150904655457,
"rewards/format_reward_step": 0.9375,
"step": 77
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.4534714523281596,
"calib/avg_num_step_conf": 7.14453125,
"calib/ece": 0.34036468253968244,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0008026330376940116,
"calib/mean_conf": 0.9911583333333333,
"calib/mu_c": 0.9908780487804878,
"calib/mu_w": 0.9916806818181818,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.34036468253968244,
"calib/std_conf": 0.0032513077344550156,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.910915711947627,
"calib/step_q_c_n": 1222.0,
"calib/step_q_gap": 0.0065277383067702255,
"calib/step_q_w": 0.9043879736408568,
"calib/step_q_w_n": 607.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2379.0,
"completions/max_terminated_length": 2379.0,
"completions/mean_length": 841.29296875,
"completions/mean_terminated_length": 854.6468505859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 449.0,
"epoch": 0.0832,
"grad_norm": 0.0048126475885510445,
"kl": 0.07703399658203125,
"learning_rate": 3.3888888888888893e-06,
"loss": -0.0472,
"num_tokens": 25845492.0,
"reward": 0.8382173776626587,
"reward_std": 0.19039109349250793,
"rewards/accuracy_reward_step": 0.640625,
"rewards/asymmetric_l2_reward": 0.7068257331848145,
"rewards/final_brier_reward_step": 0.6461716294288635,
"rewards/format_reward_step": 0.9765625,
"step": 78
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.4819757842597689,
"calib/avg_num_step_conf": 7.41796875,
"calib/ece": 0.35984720000000003,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00037764171711607286,
"calib/mean_conf": 0.9918472,
"calib/mu_c": 0.9917082278481012,
"calib/mu_w": 0.9920858695652173,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.35984720000000003,
"calib/std_conf": 0.00402043929938011,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9162189516129032,
"calib/step_q_c_n": 1240.0,
"calib/step_q_gap": 0.007129422022614906,
"calib/step_q_w": 0.9090895295902883,
"calib/step_q_w_n": 659.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2280.0,
"completions/max_terminated_length": 2280.0,
"completions/mean_length": 811.25390625,
"completions/mean_terminated_length": 827.4143676757812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 497.0,
"epoch": 0.08426666666666667,
"grad_norm": 0.004438234027475119,
"kl": 0.070159912109375,
"learning_rate": 3.3611111111111117e-06,
"loss": -0.0114,
"num_tokens": 26159549.0,
"reward": 0.8238353729248047,
"reward_std": 0.22975537180900574,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/asymmetric_l2_reward": 0.7078436017036438,
"rewards/final_brier_reward_step": 0.6226396560668945,
"rewards/format_reward_step": 0.96875,
"step": 79
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.4900687547746371,
"calib/avg_num_step_conf": 7.953125,
"calib/ece": 0.302919028340081,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00019404125286448526,
"calib/mean_conf": 0.9911781376518218,
"calib/mu_c": 0.9911176470588235,
"calib/mu_w": 0.991311688311688,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.302919028340081,
"calib/std_conf": 0.003187883147295252,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9157367272727273,
"calib/step_q_c_n": 1375.0,
"calib/step_q_gap": 0.0034265911153900097,
"calib/step_q_w": 0.9123101361573372,
"calib/step_q_w_n": 661.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2978.0,
"completions/max_terminated_length": 2978.0,
"completions/mean_length": 773.28125,
"completions/mean_terminated_length": 795.0200805664062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 534.0,
"epoch": 0.08533333333333333,
"grad_norm": 0.005505390930920839,
"kl": 0.080474853515625,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0005,
"num_tokens": 26459669.0,
"reward": 0.8524667024612427,
"reward_std": 0.2551957368850708,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/asymmetric_l2_reward": 0.7146350741386414,
"rewards/final_brier_reward_step": 0.665298342704773,
"rewards/format_reward_step": 0.9609375,
"step": 80
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5199259581881535,
"calib/avg_num_step_conf": 7.8671875,
"calib/ece": 0.318744,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.000526422764227652,
"calib/mean_conf": 0.9907440000000001,
"calib/mu_c": 0.9909166666666666,
"calib/mu_w": 0.9903902439024389,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.318744,
"calib/std_conf": 0.00405345087548869,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9207250177179306,
"calib/step_q_c_n": 1411.0,
"calib/step_q_gap": 0.013496161996537581,
"calib/step_q_w": 0.907228855721393,
"calib/step_q_w_n": 603.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2670.0,
"completions/max_terminated_length": 2670.0,
"completions/mean_length": 819.91015625,
"completions/mean_terminated_length": 829.6324462890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 470.0,
"epoch": 0.0864,
"grad_norm": 0.004928308539092541,
"kl": 0.07088470458984375,
"learning_rate": 3.3055555555555558e-06,
"loss": -0.0144,
"num_tokens": 26775814.0,
"reward": 0.8462182879447937,
"reward_std": 0.25106942653656006,
"rewards/accuracy_reward_step": 0.65625,
"rewards/asymmetric_l2_reward": 0.7091135382652283,
"rewards/final_brier_reward_step": 0.6583229303359985,
"rewards/format_reward_step": 0.96875,
"step": 81
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5263157894736842,
"calib/avg_num_step_conf": 7.92578125,
"calib/ece": 0.370248,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0009602716468593009,
"calib/mean_conf": 0.990248,
"calib/mu_c": 0.9906129032258065,
"calib/mu_w": 0.9896526315789472,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.370248,
"calib/std_conf": 0.003966168932357777,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9210427018633541,
"calib/step_q_c_n": 1288.0,
"calib/step_q_gap": 0.010635144508428285,
"calib/step_q_w": 0.9104075573549258,
"calib/step_q_w_n": 741.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2816.0,
"completions/max_terminated_length": 2816.0,
"completions/mean_length": 784.0390625,
"completions/mean_terminated_length": 796.4841918945312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 537.0,
"epoch": 0.08746666666666666,
"grad_norm": 0.004967535380274057,
"kl": 0.07586669921875,
"learning_rate": 3.277777777777778e-06,
"loss": 0.0084,
"num_tokens": 27082080.0,
"reward": 0.8055253028869629,
"reward_std": 0.19441071152687073,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/asymmetric_l2_reward": 0.6871503591537476,
"rewards/final_brier_reward_step": 0.6090565919876099,
"rewards/format_reward_step": 0.96875,
"step": 82
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.482809588072746,
"calib/avg_num_step_conf": 7.84375,
"calib/ece": 0.4586956000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0001689287320867372,
"calib/mean_conf": 0.9906956000000001,
"calib/mu_c": 0.9906165413533833,
"calib/mu_w": 0.99078547008547,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4586956000000001,
"calib/std_conf": 0.003282100035038545,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9204350940017905,
"calib/step_q_c_n": 1117.0,
"calib/step_q_gap": 0.00724429714432695,
"calib/step_q_w": 0.9131907968574635,
"calib/step_q_w_n": 891.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 1702.0,
"completions/max_terminated_length": 1702.0,
"completions/mean_length": 820.61328125,
"completions/mean_terminated_length": 836.960205078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 526.0,
"epoch": 0.08853333333333334,
"grad_norm": 0.004836047068238258,
"kl": 0.07021331787109375,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.0354,
"num_tokens": 27399421.0,
"reward": 0.7269289493560791,
"reward_std": 0.18077202141284943,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/asymmetric_l2_reward": 0.6259989738464355,
"rewards/final_brier_reward_step": 0.5278588533401489,
"rewards/format_reward_step": 0.9765625,
"step": 83
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.4943981240229286,
"calib/avg_num_step_conf": 8.1953125,
"calib/ece": 0.38960474308300397,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -8.370244919175818e-05,
"calib/mean_conf": 0.990395256916996,
"calib/mu_c": 0.9903618421052633,
"calib/mu_w": 0.9904455445544551,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.38960474308300397,
"calib/std_conf": 0.002242664201469968,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9229286775631502,
"calib/step_q_c_n": 1346.0,
"calib/step_q_gap": 0.013153411605703491,
"calib/step_q_w": 0.9097752659574467,
"calib/step_q_w_n": 752.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 3011.0,
"completions/max_terminated_length": 3011.0,
"completions/mean_length": 787.5703125,
"completions/mean_terminated_length": 793.7716674804688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 491.0,
"epoch": 0.0896,
"grad_norm": 0.005793274845927954,
"kl": 0.07592010498046875,
"learning_rate": 3.2222222222222227e-06,
"loss": 0.0023,
"num_tokens": 27706959.0,
"reward": 0.8168776035308838,
"reward_std": 0.2131912112236023,
"rewards/accuracy_reward_step": 0.59375,
"rewards/asymmetric_l2_reward": 0.7161560654640198,
"rewards/final_brier_reward_step": 0.6011929512023926,
"rewards/format_reward_step": 0.98828125,
"step": 84
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.4797160243407708,
"calib/avg_num_step_conf": 8.15234375,
"calib/ece": 0.40364372469635623,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.000364029749830852,
"calib/mean_conf": 0.9906882591093117,
"calib/mu_c": 0.9905379310344826,
"calib/mu_w": 0.9909019607843135,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.40364372469635623,
"calib/std_conf": 0.00261805932345344,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9229945355191257,
"calib/step_q_c_n": 1281.0,
"calib/step_q_gap": 0.012014386635751051,
"calib/step_q_w": 0.9109801488833746,
"calib/step_q_w_n": 806.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2815.0,
"completions/max_terminated_length": 2815.0,
"completions/mean_length": 850.41015625,
"completions/mean_terminated_length": 863.9087524414062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 495.0,
"epoch": 0.09066666666666667,
"grad_norm": 0.0050011295825243,
"kl": 0.0685577392578125,
"learning_rate": 3.1944444444444443e-06,
"loss": 0.0118,
"num_tokens": 28032488.0,
"reward": 0.7705468535423279,
"reward_std": 0.22396138310432434,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/asymmetric_l2_reward": 0.6612777709960938,
"rewards/final_brier_reward_step": 0.5735659599304199,
"rewards/format_reward_step": 0.96484375,
"step": 85
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.47517161737345226,
"calib/avg_num_step_conf": 8.1015625,
"calib/ece": 0.42288492063492056,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00032931288894577637,
"calib/mean_conf": 0.990345238095238,
"calib/mu_c": 0.9902027972027971,
"calib/mu_w": 0.9905321100917429,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.42288492063492056,
"calib/std_conf": 0.0027767203202630013,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9197145187601959,
"calib/step_q_c_n": 1226.0,
"calib/step_q_gap": 0.006379613099818493,
"calib/step_q_w": 0.9133349056603774,
"calib/step_q_w_n": 848.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2963.0,
"completions/max_terminated_length": 2963.0,
"completions/mean_length": 850.4453125,
"completions/mean_terminated_length": 860.5296630859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 542.0,
"epoch": 0.09173333333333333,
"grad_norm": 0.005102749913930893,
"kl": 0.07195281982421875,
"learning_rate": 3.1666666666666667e-06,
"loss": -0.0102,
"num_tokens": 28355714.0,
"reward": 0.7638604640960693,
"reward_std": 0.23303180932998657,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/asymmetric_l2_reward": 0.6525702476501465,
"rewards/final_brier_reward_step": 0.5665569305419922,
"rewards/format_reward_step": 0.984375,
"step": 86
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.521505376344086,
"calib/avg_num_step_conf": 8.33203125,
"calib/ece": 0.237246963562753,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0003763440860211631,
"calib/mean_conf": 0.9902834008097166,
"calib/mu_c": 0.9903763440860214,
"calib/mu_w": 0.9900000000000002,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.237246963562753,
"calib/std_conf": 0.0015613784309172973,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9218906832298137,
"calib/step_q_c_n": 1610.0,
"calib/step_q_gap": 0.005447088583542148,
"calib/step_q_w": 0.9164435946462716,
"calib/step_q_w_n": 523.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2970.0,
"completions/max_terminated_length": 2970.0,
"completions/mean_length": 805.390625,
"completions/mean_terminated_length": 831.3709716796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 532.0,
"epoch": 0.0928,
"grad_norm": 0.0050618527457118034,
"kl": 0.07224273681640625,
"learning_rate": 3.138888888888889e-06,
"loss": -0.0338,
"num_tokens": 28667390.0,
"reward": 0.9251996874809265,
"reward_std": 0.23489241302013397,
"rewards/accuracy_reward_step": 0.7265625,
"rewards/asymmetric_l2_reward": 0.7817424535751343,
"rewards/final_brier_reward_step": 0.7311569452285767,
"rewards/format_reward_step": 0.9609375,
"step": 87
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.4774013402829486,
"calib/avg_num_step_conf": 7.7109375,
"calib/ece": 0.3074859437751004,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00014586746090783365,
"calib/mean_conf": 0.9902168674698796,
"calib/mu_c": 0.9901705882352941,
"calib/mu_w": 0.990316455696202,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.3074859437751004,
"calib/std_conf": 0.0025930778696092926,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.9233775071633237,
"calib/step_q_c_n": 1396.0,
"calib/step_q_gap": 0.012166434498963707,
"calib/step_q_w": 0.91121107266436,
"calib/step_q_w_n": 578.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1921.0,
"completions/max_terminated_length": 1921.0,
"completions/mean_length": 837.984375,
"completions/mean_terminated_length": 851.2857666015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 122.0,
"epoch": 0.09386666666666667,
"grad_norm": 0.0049079423770308495,
"kl": 0.06745147705078125,
"learning_rate": 3.1111111111111116e-06,
"loss": -0.0159,
"num_tokens": 28991762.0,
"reward": 0.8723215460777283,
"reward_std": 0.21739079058170319,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/asymmetric_l2_reward": 0.7529059648513794,
"rewards/final_brier_reward_step": 0.6659557819366455,
"rewards/format_reward_step": 0.96484375,
"step": 88
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.4989224137931034,
"calib/avg_num_step_conf": 7.87890625,
"calib/ece": 0.4656352459016394,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -1.4008620689454077e-05,
"calib/mean_conf": 0.9902254098360657,
"calib/mu_c": 0.9902187500000001,
"calib/mu_w": 0.9902327586206896,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4656352459016394,
"calib/std_conf": 0.0019126439290279625,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9246768128916741,
"calib/step_q_c_n": 1117.0,
"calib/step_q_gap": 0.013053479558340797,
"calib/step_q_w": 0.9116233333333333,
"calib/step_q_w_n": 900.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2996.0,
"completions/max_terminated_length": 2996.0,
"completions/mean_length": 876.921875,
"completions/mean_terminated_length": 905.2096557617188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 488.0,
"epoch": 0.09493333333333333,
"grad_norm": 0.004340740852057934,
"kl": 0.06580352783203125,
"learning_rate": 3.0833333333333336e-06,
"loss": -0.0493,
"num_tokens": 29325142.0,
"reward": 0.7178754806518555,
"reward_std": 0.19721011817455292,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/asymmetric_l2_reward": 0.6355875730514526,
"rewards/final_brier_reward_step": 0.5087569952011108,
"rewards/format_reward_step": 0.953125,
"step": 89
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.49875539937037855,
"calib/avg_num_step_conf": 8.67578125,
"calib/ece": 0.3468770491803278,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -2.1231422504985886e-05,
"calib/mean_conf": 0.9903196721311475,
"calib/mu_c": 0.990312101910828,
"calib/mu_w": 0.990333333333333,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3468770491803278,
"calib/std_conf": 0.0017380335290896219,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.924378156996587,
"calib/step_q_c_n": 1465.0,
"calib/step_q_gap": 0.0031969400653700797,
"calib/step_q_w": 0.9211812169312169,
"calib/step_q_w_n": 756.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2461.0,
"completions/max_terminated_length": 2461.0,
"completions/mean_length": 826.85546875,
"completions/mean_terminated_length": 863.9795532226562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 336.0,
"epoch": 0.096,
"grad_norm": 0.004711349029093981,
"kl": 0.070892333984375,
"learning_rate": 3.055555555555556e-06,
"loss": -0.0575,
"num_tokens": 29640137.0,
"reward": 0.8198519945144653,
"reward_std": 0.2870404124259949,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/asymmetric_l2_reward": 0.7075222134590149,
"rewards/final_brier_reward_step": 0.6196815967559814,
"rewards/format_reward_step": 0.94921875,
"step": 90
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.4888357825652732,
"calib/avg_num_step_conf": 8.2734375,
"calib/ece": 0.3355742971887552,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00020288200884555074,
"calib/mean_conf": 0.9901927710843376,
"calib/mu_c": 0.990122699386503,
"calib/mu_w": 0.9903255813953485,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.3355742971887552,
"calib/std_conf": 0.0013484290844498997,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.9262064156206417,
"calib/step_q_c_n": 1434.0,
"calib/step_q_gap": 0.01088770217034929,
"calib/step_q_w": 0.9153187134502924,
"calib/step_q_w_n": 684.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 1977.0,
"completions/max_terminated_length": 1977.0,
"completions/mean_length": 870.3046875,
"completions/mean_terminated_length": 891.1920166015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 581.0,
"epoch": 0.09706666666666666,
"grad_norm": 0.005069150123745203,
"kl": 0.068084716796875,
"learning_rate": 3.0277777777777776e-06,
"loss": -0.0399,
"num_tokens": 29970647.0,
"reward": 0.8304876089096069,
"reward_std": 0.24938277900218964,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/asymmetric_l2_reward": 0.707068920135498,
"rewards/final_brier_reward_step": 0.6351562738418579,
"rewards/format_reward_step": 0.95703125,
"step": 91
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5065629904408617,
"calib/avg_num_step_conf": 9.01171875,
"calib/ece": 0.3352449799196787,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9959839357429718,
"calib/gap": -0.00036845484377190374,
"calib/mean_conf": 0.9898634538152611,
"calib/mu_c": 0.9897361963190184,
"calib/mu_w": 0.9901046511627903,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3352449799196787,
"calib/std_conf": 0.0058835272831902164,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9270372062663186,
"calib/step_q_c_n": 1532.0,
"calib/step_q_gap": 0.006266883685673408,
"calib/step_q_w": 0.9207703225806452,
"calib/step_q_w_n": 775.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2705.0,
"completions/max_terminated_length": 2705.0,
"completions/mean_length": 818.80078125,
"completions/mean_terminated_length": 838.4520263671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 582.0,
"epoch": 0.09813333333333334,
"grad_norm": 0.005148807540535927,
"kl": 0.07390594482421875,
"learning_rate": 3e-06,
"loss": -0.0121,
"num_tokens": 30286980.0,
"reward": 0.8529649972915649,
"reward_std": 0.20928457379341125,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/asymmetric_l2_reward": 0.740821361541748,
"rewards/final_brier_reward_step": 0.6432335376739502,
"rewards/format_reward_step": 0.97265625,
"step": 92
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.487980275323608,
"calib/avg_num_step_conf": 8.75390625,
"calib/ece": 0.36226800000000003,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0002238887747415541,
"calib/mean_conf": 0.990268,
"calib/mu_c": 0.9901847133757958,
"calib/mu_w": 0.9904086021505374,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.36226800000000003,
"calib/std_conf": 0.0018166386542182804,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9278586206896552,
"calib/step_q_c_n": 1450.0,
"calib/step_q_gap": 0.004825750904572934,
"calib/step_q_w": 0.9230328697850823,
"calib/step_q_w_n": 791.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 3050.0,
"completions/max_terminated_length": 3050.0,
"completions/mean_length": 858.32421875,
"completions/mean_terminated_length": 871.948486328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 562.0,
"epoch": 0.0992,
"grad_norm": 0.005295910406857729,
"kl": 0.07146453857421875,
"learning_rate": 2.9722222222222225e-06,
"loss": -0.0174,
"num_tokens": 30612487.0,
"reward": 0.8195692300796509,
"reward_std": 0.25328242778778076,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/asymmetric_l2_reward": 0.7010154128074646,
"rewards/final_brier_reward_step": 0.6201542615890503,
"rewards/format_reward_step": 0.9765625,
"step": 93
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.4979994481236203,
"calib/avg_num_step_conf": 8.26953125,
"calib/ece": 0.37865991902834006,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.000177014348786253,
"calib/mean_conf": 0.989995951417004,
"calib/mu_c": 0.9899271523178805,
"calib/mu_w": 0.9901041666666668,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.37865991902834006,
"calib/std_conf": 0.0023645457759122867,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9265103397341212,
"calib/step_q_c_n": 1354.0,
"calib/step_q_gap": 0.012775084164003192,
"calib/step_q_w": 0.913735255570118,
"calib/step_q_w_n": 763.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 3012.0,
"completions/max_terminated_length": 3012.0,
"completions/mean_length": 871.8046875,
"completions/mean_terminated_length": 889.1713256835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 571.0,
"epoch": 0.10026666666666667,
"grad_norm": 0.0050525604747235775,
"kl": 0.0719146728515625,
"learning_rate": 2.944444444444445e-06,
"loss": -0.0031,
"num_tokens": 30944349.0,
"reward": 0.7948108911514282,
"reward_std": 0.20116651058197021,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/asymmetric_l2_reward": 0.6823795437812805,
"rewards/final_brier_reward_step": 0.5970859527587891,
"rewards/format_reward_step": 0.9609375,
"step": 94
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.514673311184939,
"calib/avg_num_step_conf": 8.80859375,
"calib/ece": 0.32853937007874,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.000989341085271489,
"calib/mean_conf": 0.9899566929133857,
"calib/mu_c": 0.9902916666666667,
"calib/mu_w": 0.9893023255813952,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.32853937007874,
"calib/std_conf": 0.004645090381653441,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9304301007556676,
"calib/step_q_c_n": 1588.0,
"calib/step_q_gap": 0.017356637487301763,
"calib/step_q_w": 0.9130734632683658,
"calib/step_q_w_n": 667.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2072.0,
"completions/max_terminated_length": 2072.0,
"completions/mean_length": 832.890625,
"completions/mean_terminated_length": 839.4487915039062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 482.0,
"epoch": 0.10133333333333333,
"grad_norm": 0.005352802574634552,
"kl": 0.0714111328125,
"learning_rate": 2.916666666666667e-06,
"loss": 0.0029,
"num_tokens": 31263697.0,
"reward": 0.8585090637207031,
"reward_std": 0.22589869797229767,
"rewards/accuracy_reward_step": 0.65625,
"rewards/asymmetric_l2_reward": 0.7240145206451416,
"rewards/final_brier_reward_step": 0.663316011428833,
"rewards/format_reward_step": 0.9921875,
"step": 95
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5102040816326531,
"calib/avg_num_step_conf": 8.4296875,
"calib/ece": 0.18564000000000003,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.996,
"calib/gap": 0.001836734693877351,
"calib/mean_conf": 0.9896400000000001,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9881632653061226,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.18564000000000003,
"calib/std_conf": 0.005819828176157777,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9281357818586534,
"calib/step_q_c_n": 1797.0,
"calib/step_q_gap": 0.016833842800481613,
"calib/step_q_w": 0.9113019390581718,
"calib/step_q_w_n": 361.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 3063.0,
"completions/max_terminated_length": 3063.0,
"completions/mean_length": 784.93359375,
"completions/mean_terminated_length": 791.1141967773438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 76.0,
"epoch": 0.1024,
"grad_norm": 0.005755927413702011,
"kl": 0.0801849365234375,
"learning_rate": 2.888888888888889e-06,
"loss": -0.0024,
"num_tokens": 31570456.0,
"reward": 0.9797643423080444,
"reward_std": 0.14108261466026306,
"rewards/accuracy_reward_step": 0.78515625,
"rewards/asymmetric_l2_reward": 0.8193533420562744,
"rewards/final_brier_reward_step": 0.7893941402435303,
"rewards/format_reward_step": 0.96875,
"step": 96
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.49388794567062816,
"calib/avg_num_step_conf": 8.42578125,
"calib/ece": 0.3702320000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00011816638370087951,
"calib/mean_conf": 0.9902320000000001,
"calib/mu_c": 0.9901870967741935,
"calib/mu_w": 0.9903052631578944,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3702320000000001,
"calib/std_conf": 0.0014812751263691716,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9278878157503714,
"calib/step_q_c_n": 1346.0,
"calib/step_q_gap": 0.006870553111653854,
"calib/step_q_w": 0.9210172626387175,
"calib/step_q_w_n": 811.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2962.0,
"completions/max_terminated_length": 2962.0,
"completions/mean_length": 834.37109375,
"completions/mean_terminated_length": 847.6151123046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 387.0,
"epoch": 0.10346666666666667,
"grad_norm": 0.007479469757527113,
"kl": 0.07452392578125,
"learning_rate": 2.861111111111111e-06,
"loss": -0.0055,
"num_tokens": 31889127.0,
"reward": 0.798133134841919,
"reward_std": 0.30348360538482666,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/asymmetric_l2_reward": 0.6690092086791992,
"rewards/final_brier_reward_step": 0.612413227558136,
"rewards/format_reward_step": 0.96875,
"step": 97
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5033557046979866,
"calib/avg_num_step_conf": 8.08203125,
"calib/ece": 0.38186938775510204,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 5.3691275167522257e-05,
"calib/mean_conf": 0.9900326530612245,
"calib/mu_c": 0.9900536912751676,
"calib/mu_w": 0.9900000000000001,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.38186938775510204,
"calib/std_conf": 0.000510057121691864,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.923427536231884,
"calib/step_q_c_n": 1242.0,
"calib/step_q_gap": 0.0032098820601790257,
"calib/step_q_w": 0.920217654171705,
"calib/step_q_w_n": 827.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2976.0,
"completions/max_terminated_length": 2976.0,
"completions/mean_length": 822.546875,
"completions/mean_terminated_length": 845.670654296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 408.0,
"epoch": 0.10453333333333334,
"grad_norm": 0.0054461113177239895,
"kl": 0.0721435546875,
"learning_rate": 2.8333333333333335e-06,
"loss": 0.0025,
"num_tokens": 32205883.0,
"reward": 0.7846657037734985,
"reward_std": 0.2299382984638214,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/asymmetric_l2_reward": 0.6729419827461243,
"rewards/final_brier_reward_step": 0.5893582105636597,
"rewards/format_reward_step": 0.953125,
"step": 98
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.4964211348256116,
"calib/avg_num_step_conf": 7.6640625,
"calib/ece": 0.5362168674698796,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -5.8823529411999864e-05,
"calib/mean_conf": 0.9900321285140563,
"calib/mu_c": 0.99,
"calib/mu_w": 0.990058823529412,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.5362168674698796,
"calib/std_conf": 0.0016302154142128443,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9243573667711599,
"calib/step_q_c_n": 957.0,
"calib/step_q_gap": 0.007462839407975763,
"calib/step_q_w": 0.9168945273631841,
"calib/step_q_w_n": 1005.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 3060.0,
"completions/max_terminated_length": 3060.0,
"completions/mean_length": 867.2109375,
"completions/mean_terminated_length": 888.0240478515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 391.0,
"epoch": 0.1056,
"grad_norm": 0.0054686726070940495,
"kl": 0.06401824951171875,
"learning_rate": 2.805555555555556e-06,
"loss": -0.035,
"num_tokens": 32533689.0,
"reward": 0.6467150449752808,
"reward_std": 0.30831170082092285,
"rewards/accuracy_reward_step": 0.44140625,
"rewards/asymmetric_l2_reward": 0.5605430006980896,
"rewards/final_brier_reward_step": 0.4516370892524719,
"rewards/format_reward_step": 0.96484375,
"step": 99
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5060975609756098,
"calib/avg_num_step_conf": 7.73828125,
"calib/ece": 0.32053061224489776,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 6.24811803673353e-05,
"calib/mean_conf": 0.9899183673469386,
"calib/mu_c": 0.9899390243902438,
"calib/mu_w": 0.9898765432098765,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.32053061224489776,
"calib/std_conf": 0.0015627909974756458,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9257551896921975,
"calib/step_q_c_n": 1397.0,
"calib/step_q_gap": 0.018169573253841387,
"calib/step_q_w": 0.9075856164383561,
"calib/step_q_w_n": 584.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2148.0,
"completions/max_terminated_length": 2148.0,
"completions/mean_length": 847.85546875,
"completions/mean_terminated_length": 871.6907348632812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 566.0,
"epoch": 0.10666666666666667,
"grad_norm": 0.005433334968984127,
"kl": 0.06568145751953125,
"learning_rate": 2.7777777777777783e-06,
"loss": 0.0017,
"num_tokens": 32858148.0,
"reward": 0.8413238525390625,
"reward_std": 0.1991977095603943,
"rewards/accuracy_reward_step": 0.640625,
"rewards/asymmetric_l2_reward": 0.7170438766479492,
"rewards/final_brier_reward_step": 0.6468539237976074,
"rewards/format_reward_step": 0.953125,
"step": 100
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.49193548387096775,
"calib/avg_num_step_conf": 7.47265625,
"calib/ece": 0.48804819277108447,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00011290322580648215,
"calib/mean_conf": 0.9900562248995985,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.9901129032258064,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.48804819277108447,
"calib/std_conf": 0.0006500319776058185,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9235257731958764,
"calib/step_q_c_n": 970.0,
"calib/step_q_gap": 0.007631817734582658,
"calib/step_q_w": 0.9158939554612937,
"calib/step_q_w_n": 943.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2855.0,
"completions/max_terminated_length": 2855.0,
"completions/mean_length": 897.859375,
"completions/mean_terminated_length": 908.5059814453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 496.0,
"epoch": 0.10773333333333333,
"grad_norm": 0.005266252439469099,
"kl": 0.06288909912109375,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.0061,
"num_tokens": 33194992.0,
"reward": 0.696447491645813,
"reward_std": 0.26901131868362427,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/asymmetric_l2_reward": 0.6076316833496094,
"rewards/final_brier_reward_step": 0.49385690689086914,
"rewards/format_reward_step": 0.96875,
"step": 101
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5056179775280899,
"calib/avg_num_step_conf": 8.453125,
"calib/ece": 0.29203921568627444,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00011235955056221325,
"calib/mean_conf": 0.9900784313725489,
"calib/mu_c": 0.9901123595505619,
"calib/mu_w": 0.9899999999999997,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.29203921568627444,
"calib/std_conf": 0.0012499903882752347,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9253944836433612,
"calib/step_q_c_n": 1559.0,
"calib/step_q_gap": 0.009543243973939686,
"calib/step_q_w": 0.9158512396694215,
"calib/step_q_w_n": 605.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2082.0,
"completions/max_terminated_length": 2082.0,
"completions/mean_length": 804.48828125,
"completions/mean_terminated_length": 807.6431884765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 464.0,
"epoch": 0.1088,
"grad_norm": 0.005804200656712055,
"kl": 0.0684051513671875,
"learning_rate": 2.7222222222222224e-06,
"loss": -0.0227,
"num_tokens": 33507637.0,
"reward": 0.8951650857925415,
"reward_std": 0.1038808822631836,
"rewards/accuracy_reward_step": 0.6953125,
"rewards/asymmetric_l2_reward": 0.7516794204711914,
"rewards/final_brier_reward_step": 0.7011507749557495,
"rewards/format_reward_step": 0.9921875,
"step": 102
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.49127270157586034,
"calib/avg_num_step_conf": 7.22265625,
"calib/ece": 0.34886693548387115,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00016401667726673175,
"calib/mean_conf": 0.9899959677419357,
"calib/mu_c": 0.9899371069182389,
"calib/mu_w": 0.9901011235955056,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.34886693548387115,
"calib/std_conf": 0.0008542964652697105,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9279089506172841,
"calib/step_q_c_n": 1296.0,
"calib/step_q_gap": 0.0188962694578636,
"calib/step_q_w": 0.9090126811594205,
"calib/step_q_w_n": 552.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 1816.0,
"completions/max_terminated_length": 1816.0,
"completions/mean_length": 886.0703125,
"completions/mean_terminated_length": 907.3360595703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 529.0,
"epoch": 0.10986666666666667,
"grad_norm": 0.004914054647088051,
"kl": 0.05438232421875,
"learning_rate": 2.6944444444444444e-06,
"loss": -0.0343,
"num_tokens": 33839023.0,
"reward": 0.8248037099838257,
"reward_std": 0.1900019645690918,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/asymmetric_l2_reward": 0.7054777145385742,
"rewards/final_brier_reward_step": 0.6277234554290771,
"rewards/format_reward_step": 0.9609375,
"step": 103
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5124609578344612,
"calib/avg_num_step_conf": 7.74609375,
"calib/ece": 0.44361445783132536,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0006045028630921045,
"calib/mean_conf": 0.9897991967871487,
"calib/mu_c": 0.9900735294117647,
"calib/mu_w": 0.9894690265486726,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.44361445783132536,
"calib/std_conf": 0.002755031788938556,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.9233913043478261,
"calib/step_q_c_n": 1150.0,
"calib/step_q_gap": 0.007376898585521263,
"calib/step_q_w": 0.9160144057623049,
"calib/step_q_w_n": 833.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1909.0,
"completions/max_terminated_length": 1909.0,
"completions/mean_length": 839.453125,
"completions/mean_terminated_length": 852.77783203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 315.0,
"epoch": 0.11093333333333333,
"grad_norm": 0.005728873424232006,
"kl": 0.06304931640625,
"learning_rate": 2.666666666666667e-06,
"loss": -0.0487,
"num_tokens": 34160603.0,
"reward": 0.7302390336990356,
"reward_std": 0.22601860761642456,
"rewards/accuracy_reward_step": 0.53125,
"rewards/asymmetric_l2_reward": 0.6333199143409729,
"rewards/final_brier_reward_step": 0.5287206768989563,
"rewards/format_reward_step": 0.9609375,
"step": 104
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5056179775280899,
"calib/avg_num_step_conf": 7.75,
"calib/ece": 0.3445418326693228,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00011235955056165814,
"calib/mean_conf": 0.9899601593625499,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9898876404494383,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3445418326693228,
"calib/std_conf": 0.0006299357888781638,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9251345454545454,
"calib/step_q_c_n": 1375.0,
"calib/step_q_gap": 0.014510571727123112,
"calib/step_q_w": 0.9106239737274223,
"calib/step_q_w_n": 609.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2800.0,
"completions/max_terminated_length": 2800.0,
"completions/mean_length": 870.8203125,
"completions/mean_terminated_length": 881.1463012695312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 597.0,
"epoch": 0.112,
"grad_norm": 0.0053633954375982285,
"kl": 0.057952880859375,
"learning_rate": 2.6388888888888893e-06,
"loss": 0.0069,
"num_tokens": 34489293.0,
"reward": 0.8335608243942261,
"reward_std": 0.22055912017822266,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/asymmetric_l2_reward": 0.7094079256057739,
"rewards/final_brier_reward_step": 0.6358386278152466,
"rewards/format_reward_step": 0.9765625,
"step": 105
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 8.08984375,
"calib/ece": 0.42200000000000015,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0,
"calib/mean_conf": 0.9900000000000001,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.9899999999999999,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.42200000000000015,
"calib/std_conf": 1.1102230246251565e-16,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9289254210104251,
"calib/step_q_c_n": 1247.0,
"calib/step_q_gap": 0.009592896738580436,
"calib/step_q_w": 0.9193325242718446,
"calib/step_q_w_n": 824.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2916.0,
"completions/max_terminated_length": 2916.0,
"completions/mean_length": 874.92578125,
"completions/mean_terminated_length": 885.3004150390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 609.0,
"epoch": 0.11306666666666666,
"grad_norm": 0.005669588688760996,
"kl": 0.06478118896484375,
"learning_rate": 2.6111111111111113e-06,
"loss": 0.0109,
"num_tokens": 34817858.0,
"reward": 0.7706383466720581,
"reward_std": 0.1852344423532486,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/asymmetric_l2_reward": 0.672858476638794,
"rewards/final_brier_reward_step": 0.562949538230896,
"rewards/format_reward_step": 0.97265625,
"step": 106
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.4969715956558062,
"calib/avg_num_step_conf": 8.6015625,
"calib/ece": 0.3194901960784313,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -6.0568086883594496e-05,
"calib/mean_conf": 0.9900784313725489,
"calib/mu_c": 0.9900584795321639,
"calib/mu_w": 0.9901190476190475,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3194901960784313,
"calib/std_conf": 0.0008821350493491768,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9264769647696477,
"calib/step_q_c_n": 1476.0,
"calib/step_q_gap": 0.0017937691773610709,
"calib/step_q_w": 0.9246831955922866,
"calib/step_q_w_n": 726.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1709.0,
"completions/max_terminated_length": 1709.0,
"completions/mean_length": 837.74609375,
"completions/mean_terminated_length": 841.0314331054688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 533.0,
"epoch": 0.11413333333333334,
"grad_norm": 0.005860211793333292,
"kl": 0.065277099609375,
"learning_rate": 2.5833333333333337e-06,
"loss": 0.008,
"num_tokens": 35136937.0,
"reward": 0.8771469593048096,
"reward_std": 0.20320641994476318,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/asymmetric_l2_reward": 0.7518141269683838,
"rewards/final_brier_reward_step": 0.6704484224319458,
"rewards/format_reward_step": 0.9921875,
"step": 107
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5130434782608695,
"calib/avg_num_step_conf": 8.34375,
"calib/ece": 0.2510803212851407,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0002571070234114181,
"calib/mean_conf": 0.9900361445783133,
"calib/mu_c": 0.9901032608695653,
"calib/mu_w": 0.9898461538461539,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2510803212851407,
"calib/std_conf": 0.0010617003464647918,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.928213880126183,
"calib/step_q_c_n": 1585.0,
"calib/step_q_gap": 0.008195731305856269,
"calib/step_q_w": 0.9200181488203267,
"calib/step_q_w_n": 551.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 1848.0,
"completions/max_terminated_length": 1848.0,
"completions/mean_length": 858.20703125,
"completions/mean_terminated_length": 878.8040161132812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 542.0,
"epoch": 0.1152,
"grad_norm": 0.0051793381571769714,
"kl": 0.06256103515625,
"learning_rate": 2.5555555555555557e-06,
"loss": -0.0585,
"num_tokens": 35459870.0,
"reward": 0.9190077781677246,
"reward_std": 0.19223986566066742,
"rewards/accuracy_reward_step": 0.72265625,
"rewards/asymmetric_l2_reward": 0.7760034203529358,
"rewards/final_brier_reward_step": 0.7237308025360107,
"rewards/format_reward_step": 0.96875,
"step": 108
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5042735042735043,
"calib/avg_num_step_conf": 7.69921875,
"calib/ece": 0.46165322580645163,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0002564102564103221,
"calib/mean_conf": 0.9898790322580645,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9897435897435897,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.46165322580645163,
"calib/std_conf": 0.0019011572958268217,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9315300084530854,
"calib/step_q_c_n": 1183.0,
"calib/step_q_gap": 0.01933457698100427,
"calib/step_q_w": 0.9121954314720812,
"calib/step_q_w_n": 788.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2762.0,
"completions/max_terminated_length": 2762.0,
"completions/mean_length": 899.1953125,
"completions/mean_terminated_length": 917.1076049804688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 614.0,
"epoch": 0.11626666666666667,
"grad_norm": 0.004966530948877335,
"kl": 0.05828094482421875,
"learning_rate": 2.5277777777777778e-06,
"loss": 0.0011,
"num_tokens": 35794664.0,
"reward": 0.7248979806900024,
"reward_std": 0.16931957006454468,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/asymmetric_l2_reward": 0.6382572650909424,
"rewards/final_brier_reward_step": 0.5170074105262756,
"rewards/format_reward_step": 0.9609375,
"step": 109
},
{
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.49204129204129204,
"calib/avg_num_step_conf": 9.16015625,
"calib/ece": 0.3455859375000001,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0001591741591741025,
"calib/mean_conf": 0.9901171875000001,
"calib/mu_c": 0.990060606060606,
"calib/mu_w": 0.9902197802197801,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3455859375000001,
"calib/std_conf": 0.0010761701026528064,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9327749029754205,
"calib/step_q_c_n": 1546.0,
"calib/step_q_gap": 0.004339358544882432,
"calib/step_q_w": 0.9284355444305381,
"calib/step_q_w_n": 799.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1351.0,
"completions/max_terminated_length": 1351.0,
"completions/mean_length": 798.1796875,
"completions/mean_terminated_length": 801.3098754882812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 504.0,
"epoch": 0.11733333333333333,
"grad_norm": 0.005862347781658173,
"kl": 0.06954193115234375,
"learning_rate": 2.5e-06,
"loss": 0.0115,
"num_tokens": 36103918.0,
"reward": 0.8563417196273804,
"reward_std": 0.24133282899856567,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/asymmetric_l2_reward": 0.7323915958404541,
"rewards/final_brier_reward_step": 0.651385486125946,
"rewards/format_reward_step": 1.0,
"step": 110
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.508294930875576,
"calib/avg_num_step_conf": 8.734375,
"calib/ece": 0.3772727272727272,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00037063857801167277,
"calib/mean_conf": 0.9899209486166007,
"calib/mu_c": 0.9900645161290321,
"calib/mu_w": 0.9896938775510205,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3772727272727272,
"calib/std_conf": 0.0021764268613337894,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.932752100840336,
"calib/step_q_c_n": 1428.0,
"calib/step_q_gap": 0.008841209751227153,
"calib/step_q_w": 0.9239108910891088,
"calib/step_q_w_n": 808.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2145.0,
"completions/max_terminated_length": 2145.0,
"completions/mean_length": 869.59765625,
"completions/mean_terminated_length": 879.9091186523438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 581.0,
"epoch": 0.1184,
"grad_norm": 0.00547254690900445,
"kl": 0.061725616455078125,
"learning_rate": 2.4722222222222226e-06,
"loss": -0.0046,
"num_tokens": 36433943.0,
"reward": 0.8111904263496399,
"reward_std": 0.2672760486602783,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/asymmetric_l2_reward": 0.6959226131439209,
"rewards/final_brier_reward_step": 0.6092706918716431,
"rewards/format_reward_step": 0.98046875,
"step": 111
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.492980195333695,
"calib/avg_num_step_conf": 8.5,
"calib/ece": 0.3796787148594378,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00014039609332594516,
"calib/mean_conf": 0.990120481927711,
"calib/mu_c": 0.9900657894736843,
"calib/mu_w": 0.9902061855670102,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3796787148594378,
"calib/std_conf": 0.0010910102576069185,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.934830623306233,
"calib/step_q_c_n": 1476.0,
"calib/step_q_gap": 0.020387766163375898,
"calib/step_q_w": 0.9144428571428571,
"calib/step_q_w_n": 700.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 1628.0,
"completions/max_terminated_length": 1628.0,
"completions/mean_length": 866.765625,
"completions/mean_terminated_length": 887.5680541992188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 597.0,
"epoch": 0.11946666666666667,
"grad_norm": 0.0054887994192540646,
"kl": 0.05815887451171875,
"learning_rate": 2.4444444444444447e-06,
"loss": -0.0374,
"num_tokens": 36763755.0,
"reward": 0.8039360046386719,
"reward_std": 0.17428997159004211,
"rewards/accuracy_reward_step": 0.59375,
"rewards/asymmetric_l2_reward": 0.6935150027275085,
"rewards/final_brier_reward_step": 0.6010757684707642,
"rewards/format_reward_step": 0.97265625,
"step": 112
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5083067517278044,
"calib/avg_num_step_conf": 9.53125,
"calib/ece": 0.3844223107569721,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00016679957469456497,
"calib/mean_conf": 0.99,
"calib/mu_c": 0.9900657894736843,
"calib/mu_w": 0.9898989898989897,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3844223107569721,
"calib/std_conf": 0.0008926436853549044,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9353999999999999,
"calib/step_q_c_n": 1500.0,
"calib/step_q_gap": 0.007368085106382871,
"calib/step_q_w": 0.928031914893617,
"calib/step_q_w_n": 940.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2108.0,
"completions/max_terminated_length": 2108.0,
"completions/mean_length": 840.62109375,
"completions/mean_terminated_length": 853.96435546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 571.0,
"epoch": 0.12053333333333334,
"grad_norm": 0.005846248008310795,
"kl": 0.06928253173828125,
"learning_rate": 2.4166666666666667e-06,
"loss": -0.0151,
"num_tokens": 37084154.0,
"reward": 0.8061826229095459,
"reward_std": 0.23986557126045227,
"rewards/accuracy_reward_step": 0.59375,
"rewards/asymmetric_l2_reward": 0.6960577964782715,
"rewards/final_brier_reward_step": 0.6014636754989624,
"rewards/format_reward_step": 0.98046875,
"step": 113
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.4959439810208923,
"calib/avg_num_step_conf": 9.890625,
"calib/ece": 0.27976190476190477,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -8.11203795821891e-05,
"calib/mean_conf": 0.9900793650793651,
"calib/mu_c": 0.9900558659217875,
"calib/mu_w": 0.9901369863013697,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.27976190476190477,
"calib/std_conf": 0.0008873285624999172,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9386128364389235,
"calib/step_q_c_n": 1932.0,
"calib/step_q_gap": 0.01744616977225677,
"calib/step_q_w": 0.9211666666666667,
"calib/step_q_w_n": 600.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1797.0,
"completions/max_terminated_length": 1797.0,
"completions/mean_length": 843.71484375,
"completions/mean_terminated_length": 853.7194213867188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 601.0,
"epoch": 0.1216,
"grad_norm": 0.005540243349969387,
"kl": 0.0713348388671875,
"learning_rate": 2.388888888888889e-06,
"loss": -0.0212,
"num_tokens": 37405169.0,
"reward": 0.9041121006011963,
"reward_std": 0.21608759462833405,
"rewards/accuracy_reward_step": 0.69921875,
"rewards/asymmetric_l2_reward": 0.7667593955993652,
"rewards/final_brier_reward_step": 0.7047460675239563,
"rewards/format_reward_step": 0.984375,
"step": 114
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 9.14453125,
"calib/ece": 0.4384126984126985,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -1.1102230246251565e-16,
"calib/mean_conf": 0.9900000000000001,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.99,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.4384126984126985,
"calib/std_conf": 0.0008908708063747488,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9359250936329588,
"calib/step_q_c_n": 1335.0,
"calib/step_q_gap": 0.007992688066358444,
"calib/step_q_w": 0.9279324055666004,
"calib/step_q_w_n": 1006.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2821.0,
"completions/max_terminated_length": 2821.0,
"completions/mean_length": 874.20703125,
"completions/mean_terminated_length": 881.090576171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 647.0,
"epoch": 0.12266666666666666,
"grad_norm": 0.005463681183755398,
"kl": 0.0593414306640625,
"learning_rate": 2.361111111111111e-06,
"loss": 0.0265,
"num_tokens": 37734230.0,
"reward": 0.7447854280471802,
"reward_std": 0.2918502688407898,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/asymmetric_l2_reward": 0.6380273699760437,
"rewards/final_brier_reward_step": 0.5476371049880981,
"rewards/format_reward_step": 0.9765625,
"step": 115
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5059880239520959,
"calib/avg_num_step_conf": 9.203125,
"calib/ece": 0.33000000000000007,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00011976047904205256,
"calib/mean_conf": 0.9900790513833992,
"calib/mu_c": 0.9901197604790417,
"calib/mu_w": 0.9899999999999997,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.33000000000000007,
"calib/std_conf": 0.0008855872135339169,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.93751329001772,
"calib/step_q_c_n": 1693.0,
"calib/step_q_gap": 0.013365477046377694,
"calib/step_q_w": 0.9241478129713423,
"calib/step_q_w_n": 663.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2350.0,
"completions/max_terminated_length": 2350.0,
"completions/mean_length": 885.22265625,
"completions/mean_terminated_length": 892.1929321289062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 527.0,
"epoch": 0.12373333333333333,
"grad_norm": 0.005527203902602196,
"kl": 0.06006622314453125,
"learning_rate": 2.3333333333333336e-06,
"loss": -0.004,
"num_tokens": 38065367.0,
"reward": 0.8455690741539001,
"reward_std": 0.22002533078193665,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/asymmetric_l2_reward": 0.7151409387588501,
"rewards/final_brier_reward_step": 0.6509972810745239,
"rewards/format_reward_step": 0.97265625,
"step": 116
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 9.328125,
"calib/ece": 0.4821259842519685,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 2.220446049250313e-16,
"calib/mean_conf": 0.99,
"calib/mu_c": 0.9900000000000001,
"calib/mu_w": 0.9899999999999999,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.4821259842519685,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.9390930053804766,
"calib/step_q_c_n": 1301.0,
"calib/step_q_gap": 0.008163842546989875,
"calib/step_q_w": 0.9309291628334867,
"calib/step_q_w_n": 1087.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1801.0,
"completions/max_terminated_length": 1801.0,
"completions/mean_length": 856.30078125,
"completions/mean_terminated_length": 863.0432739257812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 624.0,
"epoch": 0.1248,
"grad_norm": 0.0059682149440050125,
"kl": 0.0672149658203125,
"learning_rate": 2.305555555555556e-06,
"loss": 0.0093,
"num_tokens": 38391180.0,
"reward": 0.7150706648826599,
"reward_std": 0.28290462493896484,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/asymmetric_l2_reward": 0.6275831460952759,
"rewards/final_brier_reward_step": 0.505683183670044,
"rewards/format_reward_step": 0.98046875,
"step": 117
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.495,
"calib/avg_num_step_conf": 9.69921875,
"calib/ece": 0.385292490118577,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -8.999999999992347e-05,
"calib/mean_conf": 0.9900355731225295,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9900899999999999,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.385292490118577,
"calib/std_conf": 0.0005647058134288065,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9361940768746063,
"calib/step_q_c_n": 1587.0,
"calib/step_q_gap": 0.006682916160320573,
"calib/step_q_w": 0.9295111607142857,
"calib/step_q_w_n": 896.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2272.0,
"completions/max_terminated_length": 2272.0,
"completions/mean_length": 882.9375,
"completions/mean_terminated_length": 886.4000244140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 558.0,
"epoch": 0.12586666666666665,
"grad_norm": 0.005273620132356882,
"kl": 0.06341552734375,
"learning_rate": 2.277777777777778e-06,
"loss": -0.0084,
"num_tokens": 38721220.0,
"reward": 0.7888277769088745,
"reward_std": 0.21463337540626526,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/asymmetric_l2_reward": 0.65985506772995,
"rewards/final_brier_reward_step": 0.6013941168785095,
"rewards/format_reward_step": 0.984375,
"step": 118
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5083723137414412,
"calib/avg_num_step_conf": 9.23828125,
"calib/ece": 0.3893951612903227,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00016744627482856433,
"calib/mean_conf": 0.990201612903226,
"calib/mu_c": 0.9902684563758386,
"calib/mu_w": 0.99010101010101,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3893951612903227,
"calib/std_conf": 0.0014055181498333384,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9335238095238095,
"calib/step_q_c_n": 1365.0,
"calib/step_q_gap": -0.0021461904761905792,
"calib/step_q_w": 0.9356700000000001,
"calib/step_q_w_n": 1000.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2580.0,
"completions/max_terminated_length": 2580.0,
"completions/mean_length": 881.9140625,
"completions/mean_terminated_length": 906.706787109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 648.0,
"epoch": 0.12693333333333334,
"grad_norm": 0.005632383283227682,
"kl": 0.05834197998046875,
"learning_rate": 2.25e-06,
"loss": -0.0389,
"num_tokens": 39052054.0,
"reward": 0.7916051149368286,
"reward_std": 0.2506682276725769,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/asymmetric_l2_reward": 0.6834613084793091,
"rewards/final_brier_reward_step": 0.5895925760269165,
"rewards/format_reward_step": 0.96875,
"step": 119
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.4935897435897436,
"calib/avg_num_step_conf": 8.703125,
"calib/ece": 0.3084081632653062,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00012820512820477248,
"calib/mean_conf": 0.9900408163265306,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.9901282051282047,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3084081632653062,
"calib/std_conf": 0.0006375714021148297,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9386550060313631,
"calib/step_q_c_n": 1658.0,
"calib/step_q_gap": 0.021988339364696463,
"calib/step_q_w": 0.9166666666666666,
"calib/step_q_w_n": 570.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2031.0,
"completions/max_terminated_length": 2031.0,
"completions/mean_length": 839.234375,
"completions/mean_terminated_length": 869.8137817382812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 477.0,
"epoch": 0.128,
"grad_norm": 0.005683529190719128,
"kl": 0.06261444091796875,
"learning_rate": 2.222222222222222e-06,
"loss": -0.0744,
"num_tokens": 39373586.0,
"reward": 0.8539406061172485,
"reward_std": 0.18444295227527618,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/asymmetric_l2_reward": 0.7324292063713074,
"rewards/final_brier_reward_step": 0.6543581485748291,
"rewards/format_reward_step": 0.953125,
"step": 120
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 8.234375,
"calib/ece": 0.37247011952191234,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -2.220446049250313e-16,
"calib/mean_conf": 0.99,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.9900000000000001,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.37247011952191234,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9339252336448599,
"calib/step_q_c_n": 1391.0,
"calib/step_q_gap": 0.01022927827526443,
"calib/step_q_w": 0.9236959553695955,
"calib/step_q_w_n": 717.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1811.0,
"completions/max_terminated_length": 1811.0,
"completions/mean_length": 894.49609375,
"completions/mean_terminated_length": 908.6945190429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 543.0,
"epoch": 0.12906666666666666,
"grad_norm": 0.005683033261448145,
"kl": 0.052341461181640625,
"learning_rate": 2.1944444444444445e-06,
"loss": -0.0099,
"num_tokens": 39707633.0,
"reward": 0.810142993927002,
"reward_std": 0.2799479365348816,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/asymmetric_l2_reward": 0.6949148178100586,
"rewards/final_brier_reward_step": 0.6089648008346558,
"rewards/format_reward_step": 0.9765625,
"step": 121
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 9.04296875,
"calib/ece": 0.37095238095238103,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -1.1102230246251565e-16,
"calib/mean_conf": 0.9900000000000001,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9900000000000001,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.37095238095238103,
"calib/std_conf": 1.1102230246251565e-16,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9404286628278952,
"calib/step_q_c_n": 1563.0,
"calib/step_q_gap": 0.015561641551299465,
"calib/step_q_w": 0.9248670212765957,
"calib/step_q_w_n": 752.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2896.0,
"completions/max_terminated_length": 2896.0,
"completions/mean_length": 886.609375,
"completions/mean_terminated_length": 893.590576171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 599.0,
"epoch": 0.13013333333333332,
"grad_norm": 0.005477050319314003,
"kl": 0.05792999267578125,
"learning_rate": 2.166666666666667e-06,
"loss": -0.0096,
"num_tokens": 40041949.0,
"reward": 0.8290457725524902,
"reward_std": 0.15693755447864532,
"rewards/accuracy_reward_step": 0.609375,
"rewards/asymmetric_l2_reward": 0.7225650548934937,
"rewards/final_brier_reward_step": 0.6167765259742737,
"rewards/format_reward_step": 0.984375,
"step": 122
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.49900064474532557,
"calib/avg_num_step_conf": 8.35546875,
"calib/ece": 0.4283266932270916,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -1.9987105093433044e-05,
"calib/mean_conf": 0.9900796812749003,
"calib/mu_c": 0.9900709219858154,
"calib/mu_w": 0.9900909090909088,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4283266932270916,
"calib/std_conf": 0.0008890802232837218,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9342648253452477,
"calib/step_q_c_n": 1231.0,
"calib/step_q_gap": 0.006148085257141944,
"calib/step_q_w": 0.9281167400881057,
"calib/step_q_w_n": 908.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2574.0,
"completions/max_terminated_length": 2574.0,
"completions/mean_length": 926.16015625,
"completions/mean_terminated_length": 937.142333984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 605.0,
"epoch": 0.1312,
"grad_norm": 0.005605003330856562,
"kl": 0.050136566162109375,
"learning_rate": 2.138888888888889e-06,
"loss": -0.0243,
"num_tokens": 40384334.0,
"reward": 0.752850353717804,
"reward_std": 0.23959524929523468,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/asymmetric_l2_reward": 0.640251100063324,
"rewards/final_brier_reward_step": 0.5591995716094971,
"rewards/format_reward_step": 0.98046875,
"step": 123
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 8.640625,
"calib/ece": 0.30200000000000016,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 2.220446049250313e-16,
"calib/mean_conf": 0.9900000000000001,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.9899999999999997,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.30200000000000016,
"calib/std_conf": 1.1102230246251565e-16,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9351959544879899,
"calib/step_q_c_n": 1582.0,
"calib/step_q_gap": 0.015354684646720074,
"calib/step_q_w": 0.9198412698412698,
"calib/step_q_w_n": 630.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2876.0,
"completions/max_terminated_length": 2876.0,
"completions/mean_length": 900.5390625,
"completions/mean_terminated_length": 914.8333740234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 613.0,
"epoch": 0.13226666666666667,
"grad_norm": 0.005689248442649841,
"kl": 0.06095123291015625,
"learning_rate": 2.1111111111111114e-06,
"loss": -0.0013,
"num_tokens": 40721688.0,
"reward": 0.8809398412704468,
"reward_std": 0.19557476043701172,
"rewards/accuracy_reward_step": 0.671875,
"rewards/asymmetric_l2_reward": 0.7551801204681396,
"rewards/final_brier_reward_step": 0.6777933835983276,
"rewards/format_reward_step": 0.97265625,
"step": 124
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5073529411764706,
"calib/avg_num_step_conf": 7.828125,
"calib/ece": 0.4372357723577236,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00014705882352983313,
"calib/mean_conf": 0.9900813008130082,
"calib/mu_c": 0.9901470588235296,
"calib/mu_w": 0.9899999999999998,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4372357723577236,
"calib/std_conf": 0.0008979968306656318,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9339572649572649,
"calib/step_q_c_n": 1170.0,
"calib/step_q_gap": 0.010084363278607844,
"calib/step_q_w": 0.9238729016786571,
"calib/step_q_w_n": 834.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2943.0,
"completions/max_terminated_length": 2943.0,
"completions/mean_length": 972.515625,
"completions/mean_terminated_length": 980.1732177734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 645.0,
"epoch": 0.13333333333333333,
"grad_norm": 0.005403018090873957,
"kl": 0.0468597412109375,
"learning_rate": 2.0833333333333334e-06,
"loss": 0.0194,
"num_tokens": 41075460.0,
"reward": 0.7486984133720398,
"reward_std": 0.25946474075317383,
"rewards/accuracy_reward_step": 0.53125,
"rewards/asymmetric_l2_reward": 0.6592109203338623,
"rewards/final_brier_reward_step": 0.5397484302520752,
"rewards/format_reward_step": 0.9609375,
"step": 125
},
{
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5049019607843137,
"calib/avg_num_step_conf": 8.578125,
"calib/ece": 0.40616326530612246,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00039215686274518546,
"calib/mean_conf": 0.9898367346938776,
"calib/mu_c": 0.9899999999999998,
"calib/mu_w": 0.9896078431372546,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.40616326530612246,
"calib/std_conf": 0.002550285608459317,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9396590909090908,
"calib/step_q_c_n": 1408.0,
"calib/step_q_gap": 0.014443354868481806,
"calib/step_q_w": 0.925215736040609,
"calib/step_q_w_n": 788.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2551.0,
"completions/max_terminated_length": 2551.0,
"completions/mean_length": 887.69921875,
"completions/mean_terminated_length": 905.3825073242188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 585.0,
"epoch": 0.1344,
"grad_norm": 0.004927393514662981,
"kl": 0.054393768310546875,
"learning_rate": 2.0555555555555555e-06,
"loss": 0.0065,
"num_tokens": 41408175.0,
"reward": 0.7703766822814941,
"reward_std": 0.22320319712162018,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/asymmetric_l2_reward": 0.6708583831787109,
"rewards/final_brier_reward_step": 0.5667698979377747,
"rewards/format_reward_step": 0.95703125,
"step": 126
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.4993502274204028,
"calib/avg_num_step_conf": 8.5234375,
"calib/ece": 0.4479076305220885,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -4.873294347174095e-06,
"calib/mean_conf": 0.9900763052208836,
"calib/mu_c": 0.9900740740740739,
"calib/mu_w": 0.9900789473684211,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4479076305220885,
"calib/std_conf": 0.0008491673237872525,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9387752053771471,
"calib/step_q_c_n": 1339.0,
"calib/step_q_gap": 0.017132263502888545,
"calib/step_q_w": 0.9216429418742585,
"calib/step_q_w_n": 843.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 3033.0,
"completions/max_terminated_length": 3033.0,
"completions/mean_length": 862.7578125,
"completions/mean_terminated_length": 872.9881591796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 512.0,
"epoch": 0.13546666666666668,
"grad_norm": 0.00600149342790246,
"kl": 0.056011199951171875,
"learning_rate": 2.027777777777778e-06,
"loss": -0.0121,
"num_tokens": 41732713.0,
"reward": 0.7328608632087708,
"reward_std": 0.23117367923259735,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/asymmetric_l2_reward": 0.630497395992279,
"rewards/final_brier_reward_step": 0.536005437374115,
"rewards/format_reward_step": 0.96875,
"step": 127
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5037593984962406,
"calib/avg_num_step_conf": 8.27734375,
"calib/ece": 0.45590361445783145,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 7.518796992500576e-05,
"calib/mean_conf": 0.9900401606425704,
"calib/mu_c": 0.9900751879699249,
"calib/mu_w": 0.9899999999999999,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.45590361445783145,
"calib/std_conf": 0.0006324504316475356,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9362500000000002,
"calib/step_q_c_n": 1224.0,
"calib/step_q_gap": 0.011155027932961081,
"calib/step_q_w": 0.9250949720670392,
"calib/step_q_w_n": 895.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 3013.0,
"completions/max_terminated_length": 3013.0,
"completions/mean_length": 862.859375,
"completions/mean_terminated_length": 883.5680541992188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 553.0,
"epoch": 0.13653333333333334,
"grad_norm": 0.005881587043404579,
"kl": 0.0509796142578125,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.0153,
"num_tokens": 42060269.0,
"reward": 0.7400556802749634,
"reward_std": 0.25382572412490845,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/asymmetric_l2_reward": 0.65317702293396,
"rewards/final_brier_reward_step": 0.5284968614578247,
"rewards/format_reward_step": 0.97265625,
"step": 128
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.4971116590348714,
"calib/avg_num_step_conf": 9.0625,
"calib/ece": 0.32738095238095244,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -5.776681930247385e-05,
"calib/mean_conf": 0.9900793650793651,
"calib/mu_c": 0.9900598802395209,
"calib/mu_w": 0.9901176470588233,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.32738095238095244,
"calib/std_conf": 0.0008873285624999172,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9375094102885823,
"calib/step_q_c_n": 1594.0,
"calib/step_q_gap": 0.008900594861584898,
"calib/step_q_w": 0.9286088154269974,
"calib/step_q_w_n": 726.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1943.0,
"completions/max_terminated_length": 1943.0,
"completions/mean_length": 842.69921875,
"completions/mean_terminated_length": 849.3346557617188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 607.0,
"epoch": 0.1376,
"grad_norm": 0.005735983140766621,
"kl": 0.05672454833984375,
"learning_rate": 1.9722222222222224e-06,
"loss": 0.0004,
"num_tokens": 42378384.0,
"reward": 0.8612784147262573,
"reward_std": 0.17804312705993652,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/asymmetric_l2_reward": 0.7364044785499573,
"rewards/final_brier_reward_step": 0.6588085889816284,
"rewards/format_reward_step": 0.984375,
"step": 129
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5175119080975064,
"calib/avg_num_step_conf": 9.140625,
"calib/ece": 0.3312698412698414,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0003530400672457912,
"calib/mean_conf": 0.9900000000000001,
"calib/mu_c": 0.9901204819277106,
"calib/mu_w": 0.9897674418604648,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3312698412698414,
"calib/std_conf": 0.0012598815766974253,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.941146572104019,
"calib/step_q_c_n": 1692.0,
"calib/step_q_gap": 0.01682558444969795,
"calib/step_q_w": 0.924320987654321,
"calib/step_q_w_n": 648.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1581.0,
"completions/max_terminated_length": 1581.0,
"completions/mean_length": 828.35546875,
"completions/mean_terminated_length": 831.6039428710938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 475.0,
"epoch": 0.13866666666666666,
"grad_norm": 0.005649202037602663,
"kl": 0.05800628662109375,
"learning_rate": 1.944444444444445e-06,
"loss": 0.0021,
"num_tokens": 42695731.0,
"reward": 0.8597501516342163,
"reward_std": 0.16437068581581116,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/asymmetric_l2_reward": 0.7433483004570007,
"rewards/final_brier_reward_step": 0.6511518955230713,
"rewards/format_reward_step": 0.9765625,
"step": 130
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.49642857142857144,
"calib/avg_num_step_conf": 8.43359375,
"calib/ece": 0.5545564516129033,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -7.142857142861114e-05,
"calib/mean_conf": 0.9900403225806452,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.9900714285714285,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.5545564516129033,
"calib/std_conf": 0.0006337190986089405,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9365694849368319,
"calib/step_q_c_n": 1029.0,
"calib/step_q_gap": 0.010888069007628398,
"calib/step_q_w": 0.9256814159292035,
"calib/step_q_w_n": 1130.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2980.0,
"completions/max_terminated_length": 2980.0,
"completions/mean_length": 841.28515625,
"completions/mean_terminated_length": 858.0438232421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 606.0,
"epoch": 0.13973333333333332,
"grad_norm": 0.005765112116932869,
"kl": 0.05020904541015625,
"learning_rate": 1.916666666666667e-06,
"loss": -0.0213,
"num_tokens": 43017308.0,
"reward": 0.6251735687255859,
"reward_std": 0.20246624946594238,
"rewards/accuracy_reward_step": 0.421875,
"rewards/asymmetric_l2_reward": 0.5395841002464294,
"rewards/final_brier_reward_step": 0.43263787031173706,
"rewards/format_reward_step": 0.96875,
"step": 131
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5093550673281361,
"calib/avg_num_step_conf": 9.55078125,
"calib/ece": 0.3288804780876493,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00019532246633613415,
"calib/mean_conf": 0.990235059760956,
"calib/mu_c": 0.9903012048192771,
"calib/mu_w": 0.990105882352941,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3288804780876493,
"calib/std_conf": 0.0015031593211086455,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9406541490006057,
"calib/step_q_c_n": 1651.0,
"calib/step_q_gap": 0.005754904668112126,
"calib/step_q_w": 0.9348992443324936,
"calib/step_q_w_n": 794.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1554.0,
"completions/max_terminated_length": 1554.0,
"completions/mean_length": 829.8125,
"completions/mean_terminated_length": 842.9841918945312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 519.0,
"epoch": 0.1408,
"grad_norm": 0.005729543976485729,
"kl": 0.05913543701171875,
"learning_rate": 1.888888888888889e-06,
"loss": -0.0197,
"num_tokens": 43335332.0,
"reward": 0.8531327843666077,
"reward_std": 0.29228276014328003,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/asymmetric_l2_reward": 0.726431131362915,
"rewards/final_brier_reward_step": 0.6548343896865845,
"rewards/format_reward_step": 0.9765625,
"step": 132
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5042016806722689,
"calib/avg_num_step_conf": 8.046875,
"calib/ece": 0.5159362549800797,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 8.40336134451114e-05,
"calib/mean_conf": 0.9900398406374502,
"calib/mu_c": 0.9900840336134452,
"calib/mu_w": 0.9900000000000001,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.5159362549800797,
"calib/std_conf": 0.0006299357888781636,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9339700000000001,
"calib/step_q_c_n": 1000.0,
"calib/step_q_gap": 0.005111509433962325,
"calib/step_q_w": 0.9288584905660378,
"calib/step_q_w_n": 1060.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 1580.0,
"completions/max_terminated_length": 1580.0,
"completions/mean_length": 882.74609375,
"completions/mean_terminated_length": 900.3306884765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 488.0,
"epoch": 0.14186666666666667,
"grad_norm": 0.00549586396664381,
"kl": 0.04437255859375,
"learning_rate": 1.8611111111111113e-06,
"loss": -0.0307,
"num_tokens": 43667659.0,
"reward": 0.6816405057907104,
"reward_std": 0.3141142427921295,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/asymmetric_l2_reward": 0.599159836769104,
"rewards/final_brier_reward_step": 0.47505855560302734,
"rewards/format_reward_step": 0.98046875,
"step": 133
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.511455525606469,
"calib/avg_num_step_conf": 8.0078125,
"calib/ece": 0.4090118577075099,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00023039404440994282,
"calib/mean_conf": 0.9900395256916996,
"calib/mu_c": 0.9901360544217686,
"calib/mu_w": 0.9899056603773586,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4090118577075099,
"calib/std_conf": 0.0010882134306668947,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9377241379310345,
"calib/step_q_c_n": 1305.0,
"calib/step_q_gap": 0.01595232585049755,
"calib/step_q_w": 0.9217718120805369,
"calib/step_q_w_n": 745.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1938.0,
"completions/max_terminated_length": 1938.0,
"completions/mean_length": 875.828125,
"completions/mean_terminated_length": 886.2135009765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 530.0,
"epoch": 0.14293333333333333,
"grad_norm": 0.005863112863153219,
"kl": 0.0474090576171875,
"learning_rate": 1.8333333333333333e-06,
"loss": -0.0197,
"num_tokens": 44000823.0,
"reward": 0.7867439985275269,
"reward_std": 0.22116048634052277,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/asymmetric_l2_reward": 0.6831960678100586,
"rewards/final_brier_reward_step": 0.5785729885101318,
"rewards/format_reward_step": 0.984375,
"step": 134
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5033234383066598,
"calib/avg_num_step_conf": 8.5546875,
"calib/ece": 0.40102766798418976,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00025942178626736645,
"calib/mean_conf": 0.9899604743083005,
"calib/mu_c": 0.9900671140939596,
"calib/mu_w": 0.9898076923076923,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.40102766798418976,
"calib/std_conf": 0.002084769485255432,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.9392614047791454,
"calib/step_q_c_n": 1381.0,
"calib/step_q_gap": 0.01067055187432464,
"calib/step_q_w": 0.9285908529048208,
"calib/step_q_w_n": 809.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2331.0,
"completions/max_terminated_length": 2331.0,
"completions/mean_length": 892.8046875,
"completions/mean_terminated_length": 896.305908203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 585.0,
"epoch": 0.144,
"grad_norm": 0.005807646084576845,
"kl": 0.051166534423828125,
"learning_rate": 1.8055555555555557e-06,
"loss": 0.0149,
"num_tokens": 44335261.0,
"reward": 0.7879598140716553,
"reward_std": 0.24510522186756134,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/asymmetric_l2_reward": 0.6771945953369141,
"rewards/final_brier_reward_step": 0.5862249135971069,
"rewards/format_reward_step": 0.98046875,
"step": 135
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.537602933607823,
"calib/avg_num_step_conf": 8.5390625,
"calib/ece": 0.45431999999999995,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0007578486875965407,
"calib/mean_conf": 0.99032,
"calib/mu_c": 0.9906716417910447,
"calib/mu_w": 0.9899137931034482,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.45431999999999995,
"calib/std_conf": 0.001974234028680492,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9403050595238097,
"calib/step_q_c_n": 1344.0,
"calib/step_q_gap": 0.016124536958488922,
"calib/step_q_w": 0.9241805225653208,
"calib/step_q_w_n": 842.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2198.0,
"completions/max_terminated_length": 2198.0,
"completions/mean_length": 855.94921875,
"completions/mean_terminated_length": 873.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 537.0,
"epoch": 0.14506666666666668,
"grad_norm": 0.005918469280004501,
"kl": 0.0530548095703125,
"learning_rate": 1.777777777777778e-06,
"loss": -0.0125,
"num_tokens": 44662872.0,
"reward": 0.7209823131561279,
"reward_std": 0.2641390860080719,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/asymmetric_l2_reward": 0.612058699131012,
"rewards/final_brier_reward_step": 0.5322495698928833,
"rewards/format_reward_step": 0.96484375,
"step": 136
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.49514563106796117,
"calib/avg_num_step_conf": 8.859375,
"calib/ece": 0.40204000000000006,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -9.70873786407811e-05,
"calib/mean_conf": 0.99004,
"calib/mu_c": 0.9899999999999998,
"calib/mu_w": 0.9900970873786406,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.40204000000000006,
"calib/std_conf": 0.0006311893535223806,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9392547906316535,
"calib/step_q_c_n": 1409.0,
"calib/step_q_gap": 0.008323475148533488,
"calib/step_q_w": 0.93093131548312,
"calib/step_q_w_n": 859.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2289.0,
"completions/max_terminated_length": 2289.0,
"completions/mean_length": 856.4609375,
"completions/mean_terminated_length": 866.6166381835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 628.0,
"epoch": 0.14613333333333334,
"grad_norm": 0.005562786012887955,
"kl": 0.056423187255859375,
"learning_rate": 1.75e-06,
"loss": -0.0209,
"num_tokens": 44989110.0,
"reward": 0.7751529812812805,
"reward_std": 0.22666768729686737,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/asymmetric_l2_reward": 0.6627465486526489,
"rewards/final_brier_reward_step": 0.5781843662261963,
"rewards/format_reward_step": 0.97265625,
"step": 137
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.496821172469875,
"calib/avg_num_step_conf": 9.07421875,
"calib/ece": 0.31669354838709685,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -6.357655060262868e-05,
"calib/mean_conf": 0.9900806451612904,
"calib/mu_c": 0.9900598802395209,
"calib/mu_w": 0.9901234567901235,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.31669354838709685,
"calib/std_conf": 0.0008943981053555988,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9411660561660562,
"calib/step_q_c_n": 1638.0,
"calib/step_q_gap": 0.012348537917881042,
"calib/step_q_w": 0.9288175182481752,
"calib/step_q_w_n": 685.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 1652.0,
"completions/max_terminated_length": 1652.0,
"completions/mean_length": 856.09375,
"completions/mean_terminated_length": 880.1605834960938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 548.0,
"epoch": 0.1472,
"grad_norm": 0.008828936144709587,
"kl": 0.07733917236328125,
"learning_rate": 1.7222222222222224e-06,
"loss": -0.048,
"num_tokens": 45312606.0,
"reward": 0.8505880832672119,
"reward_std": 0.26470452547073364,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/asymmetric_l2_reward": 0.723146915435791,
"rewards/final_brier_reward_step": 0.6545917987823486,
"rewards/format_reward_step": 0.96484375,
"step": 138
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.4989772727272728,
"calib/avg_num_step_conf": 10.08984375,
"calib/ece": 0.2889203187250996,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -2.537878787833847e-05,
"calib/mean_conf": 0.9901155378486056,
"calib/mu_c": 0.9901079545454546,
"calib/mu_w": 0.990133333333333,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2889203187250996,
"calib/std_conf": 0.001051747554259193,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9446305931321539,
"calib/step_q_c_n": 1922.0,
"calib/step_q_gap": 0.012860547746374684,
"calib/step_q_w": 0.9317700453857792,
"calib/step_q_w_n": 661.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2799.0,
"completions/max_terminated_length": 2799.0,
"completions/mean_length": 834.8203125,
"completions/mean_terminated_length": 848.0714721679688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 538.0,
"epoch": 0.14826666666666666,
"grad_norm": 0.005658579058945179,
"kl": 0.06348419189453125,
"learning_rate": 1.6944444444444446e-06,
"loss": -0.0179,
"num_tokens": 45629416.0,
"reward": 0.9023306965827942,
"reward_std": 0.2051040083169937,
"rewards/accuracy_reward_step": 0.6875,
"rewards/asymmetric_l2_reward": 0.7778832316398621,
"rewards/final_brier_reward_step": 0.6931843161582947,
"rewards/format_reward_step": 0.98046875,
"step": 139
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.4888345864661654,
"calib/avg_num_step_conf": 10.1640625,
"calib/ece": 0.2930278884462152,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0002233082706766254,
"calib/mean_conf": 0.9902390438247013,
"calib/mu_c": 0.9901714285714284,
"calib/mu_w": 0.990394736842105,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2930278884462152,
"calib/std_conf": 0.001527513108580146,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.946251287332647,
"calib/step_q_c_n": 1942.0,
"calib/step_q_gap": 0.011372499453859097,
"calib/step_q_w": 0.9348787878787879,
"calib/step_q_w_n": 660.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1917.0,
"completions/max_terminated_length": 1917.0,
"completions/mean_length": 853.45703125,
"completions/mean_terminated_length": 863.5770874023438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 566.0,
"epoch": 0.14933333333333335,
"grad_norm": 0.010869499295949936,
"kl": 0.07050323486328125,
"learning_rate": 1.6666666666666667e-06,
"loss": -0.0113,
"num_tokens": 45952917.0,
"reward": 0.8823206424713135,
"reward_std": 0.2503645420074463,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/asymmetric_l2_reward": 0.7434866428375244,
"rewards/final_brier_reward_step": 0.6891233921051025,
"rewards/format_reward_step": 0.9765625,
"step": 140
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5058139534883721,
"calib/avg_num_step_conf": 9.2578125,
"calib/ece": 0.2965322580645161,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.000116279069767522,
"calib/mean_conf": 0.9900806451612904,
"calib/mu_c": 0.9901162790697673,
"calib/mu_w": 0.9899999999999998,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.2965322580645161,
"calib/std_conf": 0.0008943981053555988,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.9439332603938732,
"calib/step_q_c_n": 1828.0,
"calib/step_q_gap": 0.020298574047009632,
"calib/step_q_w": 0.9236346863468635,
"calib/step_q_w_n": 542.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2350.0,
"completions/max_terminated_length": 2350.0,
"completions/mean_length": 893.41796875,
"completions/mean_terminated_length": 900.4527587890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 80.0,
"epoch": 0.1504,
"grad_norm": 0.005139007233083248,
"kl": 0.05944061279296875,
"learning_rate": 1.638888888888889e-06,
"loss": -0.0251,
"num_tokens": 46288728.0,
"reward": 0.868787407875061,
"reward_std": 0.20100921392440796,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/asymmetric_l2_reward": 0.7364983558654785,
"rewards/final_brier_reward_step": 0.6737328171730042,
"rewards/format_reward_step": 0.9609375,
"step": 141
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5174994729074426,
"calib/avg_num_step_conf": 11.21484375,
"calib/ece": 0.36837398373983743,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00034998945814912474,
"calib/mean_conf": 0.9903252032520325,
"calib/mu_c": 0.9904575163398692,
"calib/mu_w": 0.9901075268817201,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.36837398373983743,
"calib/std_conf": 0.0017737743275830446,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9478168202764977,
"calib/step_q_c_n": 1736.0,
"calib/step_q_gap": -0.00021842201425137375,
"calib/step_q_w": 0.9480352422907491,
"calib/step_q_w_n": 1135.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2413.0,
"completions/max_terminated_length": 2413.0,
"completions/mean_length": 879.140625,
"completions/mean_terminated_length": 911.1741333007812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 604.0,
"epoch": 0.15146666666666667,
"grad_norm": 0.005070097278803587,
"kl": 0.05767822265625,
"learning_rate": 1.6111111111111113e-06,
"loss": -0.0788,
"num_tokens": 46618948.0,
"reward": 0.7950658202171326,
"reward_std": 0.19968588650226593,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/asymmetric_l2_reward": 0.6783492565155029,
"rewards/final_brier_reward_step": 0.6008449196815491,
"rewards/format_reward_step": 0.95703125,
"step": 142
},
{
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.5051712328767124,
"calib/avg_num_step_conf": 11.78515625,
"calib/ece": 0.3965447154471545,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0004054794520548244,
"calib/mean_conf": 0.9900406504065041,
"calib/mu_c": 0.9902054794520547,
"calib/mu_w": 0.9897999999999999,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3965447154471545,
"calib/std_conf": 0.0029214607494034085,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9481639722863743,
"calib/step_q_c_n": 1732.0,
"calib/step_q_gap": -0.0004119032000069467,
"calib/step_q_w": 0.9485758754863812,
"calib/step_q_w_n": 1285.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2606.0,
"completions/max_terminated_length": 2606.0,
"completions/mean_length": 871.84375,
"completions/mean_terminated_length": 903.6113891601562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 611.0,
"epoch": 0.15253333333333333,
"grad_norm": 0.005727704614400864,
"kl": 0.06136322021484375,
"learning_rate": 1.5833333333333333e-06,
"loss": -0.0503,
"num_tokens": 46949476.0,
"reward": 0.7622804045677185,
"reward_std": 0.24852487444877625,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/asymmetric_l2_reward": 0.6418509483337402,
"rewards/final_brier_reward_step": 0.5780222415924072,
"rewards/format_reward_step": 0.953125,
"step": 143
},
{
"calib/answer_extract_rate": 0.9296875,
"calib/auroc": 0.5115606936416185,
"calib/avg_num_step_conf": 12.29296875,
"calib/ece": 0.26932916666666673,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0002254335260113427,
"calib/mean_conf": 0.9901625000000001,
"calib/mu_c": 0.9902254335260113,
"calib/mu_w": 0.99,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.26932916666666673,
"calib/std_conf": 0.001249437373380516,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9409575221238939,
"calib/step_q_c_n": 1695.0,
"calib/step_q_gap": -0.022279392476657,
"calib/step_q_w": 0.9632369146005509,
"calib/step_q_w_n": 1452.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2657.0,
"completions/max_terminated_length": 2657.0,
"completions/mean_length": 858.515625,
"completions/mean_terminated_length": 904.4443969726562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 646.0,
"epoch": 0.1536,
"grad_norm": 0.00599814485758543,
"kl": 0.061065673828125,
"learning_rate": 1.5555555555555558e-06,
"loss": -0.0422,
"num_tokens": 47273384.0,
"reward": 0.8610941171646118,
"reward_std": 0.2805580794811249,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/asymmetric_l2_reward": 0.7203265428543091,
"rewards/final_brier_reward_step": 0.6807678937911987,
"rewards/format_reward_step": 0.9296875,
"step": 144
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5060240963855421,
"calib/avg_num_step_conf": 11.3203125,
"calib/ece": 0.32072580645161297,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00012048192771074717,
"calib/mean_conf": 0.9900806451612904,
"calib/mu_c": 0.9901204819277106,
"calib/mu_w": 0.9899999999999999,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.32072580645161297,
"calib/std_conf": 0.0008943981053555989,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.944338682899834,
"calib/step_q_c_n": 1807.0,
"calib/step_q_gap": -0.007173691069001786,
"calib/step_q_w": 0.9515123739688358,
"calib/step_q_w_n": 1091.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2801.0,
"completions/max_terminated_length": 2801.0,
"completions/mean_length": 882.12109375,
"completions/mean_terminated_length": 903.2920532226562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 582.0,
"epoch": 0.15466666666666667,
"grad_norm": 0.005229916889220476,
"kl": 0.06345367431640625,
"learning_rate": 1.527777777777778e-06,
"loss": -0.0314,
"num_tokens": 47601911.0,
"reward": 0.8385009765625,
"reward_std": 0.27948635816574097,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/asymmetric_l2_reward": 0.6988167762756348,
"rewards/final_brier_reward_step": 0.6547476053237915,
"rewards/format_reward_step": 0.96875,
"step": 145
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5034776902887138,
"calib/avg_num_step_conf": 10.2265625,
"calib/ece": 0.4760323886639676,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 6.955380577400305e-05,
"calib/mean_conf": 0.9902024291497975,
"calib/mu_c": 0.9902362204724408,
"calib/mu_w": 0.9901666666666668,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4760323886639676,
"calib/std_conf": 0.0014083017919778223,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9437317784256561,
"calib/step_q_c_n": 1372.0,
"calib/step_q_gap": 0.003041569757919471,
"calib/step_q_w": 0.9406902086677367,
"calib/step_q_w_n": 1246.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 3070.0,
"completions/max_terminated_length": 3070.0,
"completions/mean_length": 876.76171875,
"completions/mean_terminated_length": 901.4096069335938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 547.0,
"epoch": 0.15573333333333333,
"grad_norm": 0.005378072615712881,
"kl": 0.061309814453125,
"learning_rate": 1.5e-06,
"loss": -0.0446,
"num_tokens": 47933578.0,
"reward": 0.6947678327560425,
"reward_std": 0.23827584087848663,
"rewards/accuracy_reward_step": 0.5,
"rewards/asymmetric_l2_reward": 0.5913490056991577,
"rewards/final_brier_reward_step": 0.5052179098129272,
"rewards/format_reward_step": 0.96484375,
"step": 146
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.47405221630636984,
"calib/avg_num_step_conf": 11.54296875,
"calib/ece": 0.5144400000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.000518955673872834,
"calib/mean_conf": 0.9904400000000001,
"calib/mu_c": 0.9901680672268905,
"calib/mu_w": 0.9906870229007634,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.5144400000000001,
"calib/std_conf": 0.0020509509989270853,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9464776119402984,
"calib/step_q_c_n": 1340.0,
"calib/step_q_gap": -0.00118802273462415,
"calib/step_q_w": 0.9476656346749226,
"calib/step_q_w_n": 1615.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 3019.0,
"completions/max_terminated_length": 3019.0,
"completions/mean_length": 884.7890625,
"completions/mean_terminated_length": 902.4143676757812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 609.0,
"epoch": 0.1568,
"grad_norm": 0.005266894120723009,
"kl": 0.06313323974609375,
"learning_rate": 1.4722222222222225e-06,
"loss": -0.0287,
"num_tokens": 48263764.0,
"reward": 0.6583696007728577,
"reward_std": 0.1766577810049057,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/asymmetric_l2_reward": 0.5541763305664062,
"rewards/final_brier_reward_step": 0.4742816388607025,
"rewards/format_reward_step": 0.9765625,
"step": 147
},
{
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.5229885057471264,
"calib/avg_num_step_conf": 13.546875,
"calib/ece": 0.27427983539094647,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00045977011494247044,
"calib/mean_conf": 0.9903292181069958,
"calib/mu_c": 0.9904597701149424,
"calib/mu_w": 0.9899999999999999,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.27427983539094647,
"calib/std_conf": 0.0017843196204673922,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9481062469257255,
"calib/step_q_c_n": 2033.0,
"calib/step_q_gap": -0.014729990008072402,
"calib/step_q_w": 0.9628362369337979,
"calib/step_q_w_n": 1435.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 1593.0,
"completions/max_terminated_length": 1593.0,
"completions/mean_length": 846.07421875,
"completions/mean_terminated_length": 884.0611572265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 585.0,
"epoch": 0.15786666666666666,
"grad_norm": 0.006033976562321186,
"kl": 0.065948486328125,
"learning_rate": 1.4444444444444445e-06,
"loss": -0.0728,
"num_tokens": 48585471.0,
"reward": 0.8637349605560303,
"reward_std": 0.29628562927246094,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/asymmetric_l2_reward": 0.7167023420333862,
"rewards/final_brier_reward_step": 0.6849863529205322,
"rewards/format_reward_step": 0.94921875,
"step": 148
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.49508228968592227,
"calib/avg_num_step_conf": 10.328125,
"calib/ece": 0.3910317460317462,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -9.835420628145908e-05,
"calib/mean_conf": 0.9902380952380954,
"calib/mu_c": 0.9901986754966886,
"calib/mu_w": 0.9902970297029701,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3910317460317462,
"calib/std_conf": 0.0015245533898649653,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9458948004836759,
"calib/step_q_c_n": 1654.0,
"calib/step_q_gap": 0.009228133817009154,
"calib/step_q_w": 0.9366666666666668,
"calib/step_q_w_n": 990.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1573.0,
"completions/max_terminated_length": 1573.0,
"completions/mean_length": 883.15234375,
"completions/mean_terminated_length": 893.62451171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 613.0,
"epoch": 0.15893333333333334,
"grad_norm": 0.005128260236233473,
"kl": 0.0658111572265625,
"learning_rate": 1.4166666666666667e-06,
"loss": -0.0232,
"num_tokens": 48916014.0,
"reward": 0.7963246703147888,
"reward_std": 0.24546730518341064,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/asymmetric_l2_reward": 0.680401623249054,
"rewards/final_brier_reward_step": 0.597403883934021,
"rewards/format_reward_step": 0.984375,
"step": 149
},
{
"calib/answer_extract_rate": 0.9296875,
"calib/auroc": 0.5257165605095542,
"calib/avg_num_step_conf": 12.86328125,
"calib/ece": 0.3281434599156119,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0005143312101910658,
"calib/mean_conf": 0.9905907172995781,
"calib/mu_c": 0.990764331210191,
"calib/mu_w": 0.99025,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3281434599156119,
"calib/std_conf": 0.002357589037080001,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9469197630586969,
"calib/step_q_c_n": 1857.0,
"calib/step_q_gap": -0.01014848206665131,
"calib/step_q_w": 0.9570682451253482,
"calib/step_q_w_n": 1436.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2986.0,
"completions/max_terminated_length": 2986.0,
"completions/mean_length": 848.53125,
"completions/mean_terminated_length": 890.2622680664062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 604.0,
"epoch": 0.16,
"grad_norm": 0.00580455781891942,
"kl": 0.0648956298828125,
"learning_rate": 1.3888888888888892e-06,
"loss": -0.059,
"num_tokens": 49238198.0,
"reward": 0.8208293318748474,
"reward_std": 0.2286253273487091,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/asymmetric_l2_reward": 0.7145583629608154,
"rewards/final_brier_reward_step": 0.61928790807724,
"rewards/format_reward_step": 0.92578125,
"step": 150
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5037756550922938,
"calib/avg_num_step_conf": 8.5703125,
"calib/ece": 0.4800763052208836,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 6.763908609774827e-05,
"calib/mean_conf": 0.9901164658634539,
"calib/mu_c": 0.9901496062992124,
"calib/mu_w": 0.9900819672131147,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.4800763052208836,
"calib/std_conf": 0.0013849774764524091,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9414787066246058,
"calib/step_q_c_n": 1268.0,
"calib/step_q_gap": 0.026543501441020467,
"calib/step_q_w": 0.9149352051835853,
"calib/step_q_w_n": 926.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2751.0,
"completions/max_terminated_length": 2751.0,
"completions/mean_length": 912.84375,
"completions/mean_terminated_length": 934.7520141601562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 583.0,
"epoch": 0.16106666666666666,
"grad_norm": 0.005261977203190327,
"kl": 0.05495452880859375,
"learning_rate": 1.3611111111111112e-06,
"loss": -0.0375,
"num_tokens": 49578910.0,
"reward": 0.7145134210586548,
"reward_std": 0.2643086314201355,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/asymmetric_l2_reward": 0.6306858062744141,
"rewards/final_brier_reward_step": 0.5053722262382507,
"rewards/format_reward_step": 0.96875,
"step": 151
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5045454545454545,
"calib/avg_num_step_conf": 9.4921875,
"calib/ece": 0.4369918699186992,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0003636363636365658,
"calib/mean_conf": 0.9898373983739838,
"calib/mu_c": 0.9900000000000001,
"calib/mu_w": 0.9896363636363635,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4369918699186992,
"calib/std_conf": 0.002545118023170494,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9398222222222222,
"calib/step_q_c_n": 1350.0,
"calib/step_q_gap": 0.00441481481481476,
"calib/step_q_w": 0.9354074074074075,
"calib/step_q_w_n": 1080.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2570.0,
"completions/max_terminated_length": 2570.0,
"completions/mean_length": 859.7890625,
"completions/mean_terminated_length": 883.9597778320312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 551.0,
"epoch": 0.16213333333333332,
"grad_norm": 0.005733448546379805,
"kl": 0.05826568603515625,
"learning_rate": 1.3333333333333334e-06,
"loss": -0.0428,
"num_tokens": 49904408.0,
"reward": 0.7376753091812134,
"reward_std": 0.24671050906181335,
"rewards/accuracy_reward_step": 0.53125,
"rewards/asymmetric_l2_reward": 0.6415493488311768,
"rewards/final_brier_reward_step": 0.5361449122428894,
"rewards/format_reward_step": 0.95703125,
"step": 152
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 8.51953125,
"calib/ece": 0.3540000000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -1.1102230246251565e-16,
"calib/mean_conf": 0.9900000000000001,
"calib/mu_c": 0.9899999999999998,
"calib/mu_w": 0.9899999999999999,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3540000000000001,
"calib/std_conf": 1.1102230246251565e-16,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9335677276091784,
"calib/step_q_c_n": 1351.0,
"calib/step_q_gap": 0.00018218544050363494,
"calib/step_q_w": 0.9333855421686748,
"calib/step_q_w_n": 830.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2805.0,
"completions/max_terminated_length": 2805.0,
"completions/mean_length": 865.0390625,
"completions/mean_terminated_length": 875.2964477539062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 579.0,
"epoch": 0.1632,
"grad_norm": 0.005814076401293278,
"kl": 0.055614471435546875,
"learning_rate": 1.3055555555555556e-06,
"loss": 0.0202,
"num_tokens": 50233178.0,
"reward": 0.8162698745727539,
"reward_std": 0.2791767120361328,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/asymmetric_l2_reward": 0.6849030256271362,
"rewards/final_brier_reward_step": 0.6281054019927979,
"rewards/format_reward_step": 0.9765625,
"step": 153
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5020976353928299,
"calib/avg_num_step_conf": 9.01171875,
"calib/ece": 0.4425793650793649,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 4.195270785656291e-05,
"calib/mean_conf": 0.9901984126984126,
"calib/mu_c": 0.9902173913043477,
"calib/mu_w": 0.9901754385964912,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.4425793650793649,
"calib/std_conf": 0.001394546300857587,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9370293209876543,
"calib/step_q_c_n": 1296.0,
"calib/step_q_gap": 0.0033695781587720353,
"calib/step_q_w": 0.9336597428288823,
"calib/step_q_w_n": 1011.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1551.0,
"completions/max_terminated_length": 1551.0,
"completions/mean_length": 815.5703125,
"completions/mean_terminated_length": 828.5159301757812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 492.0,
"epoch": 0.16426666666666667,
"grad_norm": 0.006068110000342131,
"kl": 0.060028076171875,
"learning_rate": 1.2777777777777779e-06,
"loss": -0.0165,
"num_tokens": 50546404.0,
"reward": 0.750443696975708,
"reward_std": 0.22490191459655762,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/asymmetric_l2_reward": 0.6540299654006958,
"rewards/final_brier_reward_step": 0.5437324047088623,
"rewards/format_reward_step": 0.9765625,
"step": 154
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.49916634603052457,
"calib/avg_num_step_conf": 8.2578125,
"calib/ece": 0.44027888446215147,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -1.6031807105409968e-05,
"calib/mean_conf": 0.9900796812749004,
"calib/mu_c": 0.9900724637681159,
"calib/mu_w": 0.9900884955752213,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.44027888446215147,
"calib/std_conf": 0.0012598715777563277,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9308571428571428,
"calib/step_q_c_n": 1190.0,
"calib/step_q_gap": 0.0027402597402598206,
"calib/step_q_w": 0.928116883116883,
"calib/step_q_w_n": 924.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 1327.0,
"completions/max_terminated_length": 1327.0,
"completions/mean_length": 795.71875,
"completions/mean_terminated_length": 811.5697631835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 537.0,
"epoch": 0.16533333333333333,
"grad_norm": 0.005755608901381493,
"kl": 0.0548095703125,
"learning_rate": 1.25e-06,
"loss": -0.0586,
"num_tokens": 50857324.0,
"reward": 0.7406020760536194,
"reward_std": 0.23273462057113647,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/asymmetric_l2_reward": 0.6313012838363647,
"rewards/final_brier_reward_step": 0.5475590229034424,
"rewards/format_reward_step": 0.97265625,
"step": 155
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5070422535211268,
"calib/avg_num_step_conf": 8.2421875,
"calib/ece": 0.4310236220472441,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0001408450704224462,
"calib/mean_conf": 0.9900787401574803,
"calib/mu_c": 0.9901408450704225,
"calib/mu_w": 0.9900000000000001,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.4310236220472441,
"calib/std_conf": 0.001252438875636762,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9351894818252126,
"calib/step_q_c_n": 1293.0,
"calib/step_q_gap": 0.014822284762789018,
"calib/step_q_w": 0.9203671970624235,
"calib/step_q_w_n": 817.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2409.0,
"completions/max_terminated_length": 2409.0,
"completions/mean_length": 843.8125,
"completions/mean_terminated_length": 847.1216430664062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 490.0,
"epoch": 0.1664,
"grad_norm": 0.005306870210915804,
"kl": 0.051036834716796875,
"learning_rate": 1.2222222222222223e-06,
"loss": 0.0101,
"num_tokens": 51178100.0,
"reward": 0.7855024933815002,
"reward_std": 0.25378745794296265,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/asymmetric_l2_reward": 0.7000096440315247,
"rewards/final_brier_reward_step": 0.5631827712059021,
"rewards/format_reward_step": 0.984375,
"step": 156
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5118317581704062,
"calib/avg_num_step_conf": 8.30078125,
"calib/ece": 0.32208000000000014,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00048048481350548844,
"calib/mean_conf": 0.9900800000000002,
"calib/mu_c": 0.9902395209580837,
"calib/mu_w": 0.9897590361445782,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.32208000000000014,
"calib/std_conf": 0.002365079279855119,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9310576271186441,
"calib/step_q_c_n": 1475.0,
"calib/step_q_gap": 0.00818070404172111,
"calib/step_q_w": 0.922876923076923,
"calib/step_q_w_n": 650.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2060.0,
"completions/max_terminated_length": 2060.0,
"completions/mean_length": 822.86328125,
"completions/mean_terminated_length": 839.2550048828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 513.0,
"epoch": 0.16746666666666668,
"grad_norm": 0.005075718276202679,
"kl": 0.0534820556640625,
"learning_rate": 1.1944444444444446e-06,
"loss": -0.0414,
"num_tokens": 51492481.0,
"reward": 0.8595537543296814,
"reward_std": 0.16089923679828644,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/asymmetric_l2_reward": 0.7353023886680603,
"rewards/final_brier_reward_step": 0.6588050723075867,
"rewards/format_reward_step": 0.97265625,
"step": 157
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5083156639279088,
"calib/avg_num_step_conf": 8.12890625,
"calib/ece": 0.37880952380952393,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00037105751391453,
"calib/mean_conf": 0.9899206349206351,
"calib/mu_c": 0.9900649350649349,
"calib/mu_w": 0.9896938775510203,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.37880952380952393,
"calib/std_conf": 0.001990466064124518,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9337134502923978,
"calib/step_q_c_n": 1368.0,
"calib/step_q_gap": 0.013082314247516935,
"calib/step_q_w": 0.9206311360448809,
"calib/step_q_w_n": 713.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2763.0,
"completions/max_terminated_length": 2763.0,
"completions/mean_length": 823.91796875,
"completions/mean_terminated_length": 830.405517578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 518.0,
"epoch": 0.16853333333333334,
"grad_norm": 0.006082612089812756,
"kl": 0.05602264404296875,
"learning_rate": 1.1666666666666668e-06,
"loss": 0.019,
"num_tokens": 51808644.0,
"reward": 0.8030421733856201,
"reward_std": 0.2842303514480591,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/asymmetric_l2_reward": 0.6795475482940674,
"rewards/final_brier_reward_step": 0.609349250793457,
"rewards/format_reward_step": 0.984375,
"step": 158
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5009805910597146,
"calib/avg_num_step_conf": 8.23828125,
"calib/ece": 0.35916269841269854,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 2.901196997351363e-05,
"calib/mean_conf": 0.9901150793650795,
"calib/mu_c": 0.9901257861635216,
"calib/mu_w": 0.9900967741935481,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.35916269841269854,
"calib/std_conf": 0.0010496838118273031,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.9312385643912737,
"calib/step_q_c_n": 1421.0,
"calib/step_q_gap": 0.011408622530808543,
"calib/step_q_w": 0.9198299418604652,
"calib/step_q_w_n": 688.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2796.0,
"completions/max_terminated_length": 2796.0,
"completions/mean_length": 792.65625,
"completions/mean_terminated_length": 798.8976440429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 566.0,
"epoch": 0.1696,
"grad_norm": 0.005457705818116665,
"kl": 0.049915313720703125,
"learning_rate": 1.138888888888889e-06,
"loss": -0.0069,
"num_tokens": 52116348.0,
"reward": 0.845173180103302,
"reward_std": 0.22259722650051117,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/asymmetric_l2_reward": 0.7504348754882812,
"rewards/final_brier_reward_step": 0.6203800439834595,
"rewards/format_reward_step": 0.9765625,
"step": 159
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5054326610279767,
"calib/avg_num_step_conf": 7.8515625,
"calib/ece": 0.41247011952191226,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00011255692908251902,
"calib/mean_conf": 0.9901593625498007,
"calib/mu_c": 0.990206896551724,
"calib/mu_w": 0.9900943396226415,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.41247011952191226,
"calib/std_conf": 0.0017781604465674436,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.9329748822605967,
"calib/step_q_c_n": 1274.0,
"calib/step_q_gap": 0.01552923008668361,
"calib/step_q_w": 0.917445652173913,
"calib/step_q_w_n": 736.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2869.0,
"completions/max_terminated_length": 2869.0,
"completions/mean_length": 856.1015625,
"completions/mean_terminated_length": 859.4588623046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 560.0,
"epoch": 0.17066666666666666,
"grad_norm": 0.005602055694907904,
"kl": 0.04767608642578125,
"learning_rate": 1.111111111111111e-06,
"loss": 0.0147,
"num_tokens": 52440350.0,
"reward": 0.7861953973770142,
"reward_std": 0.2470768243074417,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/asymmetric_l2_reward": 0.6940505504608154,
"rewards/final_brier_reward_step": 0.5705277323722839,
"rewards/format_reward_step": 0.97265625,
"step": 160
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.500601829561868,
"calib/avg_num_step_conf": 8.19921875,
"calib/ece": 0.2549802371541503,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 1.2036591237607475e-05,
"calib/mean_conf": 0.9901581027667985,
"calib/mu_c": 0.9901612903225807,
"calib/mu_w": 0.9901492537313431,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2549802371541503,
"calib/std_conf": 0.001247409789569922,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9285210824417873,
"calib/step_q_c_n": 1589.0,
"calib/step_q_gap": 0.009305396167277546,
"calib/step_q_w": 0.9192156862745098,
"calib/step_q_w_n": 510.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2497.0,
"completions/max_terminated_length": 2497.0,
"completions/mean_length": 786.68359375,
"completions/mean_terminated_length": 792.8779296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 433.0,
"epoch": 0.17173333333333332,
"grad_norm": 0.005685119424015284,
"kl": 0.056888580322265625,
"learning_rate": 1.0833333333333335e-06,
"loss": 0.0248,
"num_tokens": 52745661.0,
"reward": 0.9298093318939209,
"reward_std": 0.1548190861940384,
"rewards/accuracy_reward_step": 0.7265625,
"rewards/asymmetric_l2_reward": 0.785028338432312,
"rewards/final_brier_reward_step": 0.7316214442253113,
"rewards/format_reward_step": 0.98828125,
"step": 161
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.512847790507365,
"calib/avg_num_step_conf": 8.0078125,
"calib/ece": 0.246798418972332,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0008756137479541959,
"calib/mean_conf": 0.9898814229249012,
"calib/mu_c": 0.9901063829787233,
"calib/mu_w": 0.9892307692307691,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.246798418972332,
"calib/std_conf": 0.0034984144843255077,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9292870427774333,
"calib/step_q_c_n": 1613.0,
"calib/step_q_gap": 0.014275601129836057,
"calib/step_q_w": 0.9150114416475973,
"calib/step_q_w_n": 437.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1588.0,
"completions/max_terminated_length": 1588.0,
"completions/mean_length": 774.3359375,
"completions/mean_terminated_length": 783.517822265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 481.0,
"epoch": 0.1728,
"grad_norm": 0.00584648922085762,
"kl": 0.0536041259765625,
"learning_rate": 1.0555555555555557e-06,
"loss": -0.0188,
"num_tokens": 53048035.0,
"reward": 0.9473156332969666,
"reward_std": 0.21745626628398895,
"rewards/accuracy_reward_step": 0.734375,
"rewards/asymmetric_l2_reward": 0.8112285733222961,
"rewards/final_brier_reward_step": 0.7396527528762817,
"rewards/format_reward_step": 0.984375,
"step": 162
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5099684168969602,
"calib/avg_num_step_conf": 7.53515625,
"calib/ece": 0.3964940239043825,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0002013422818791799,
"calib/mean_conf": 0.9901195219123506,
"calib/mu_c": 0.990201342281879,
"calib/mu_w": 0.9899999999999998,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3964940239043825,
"calib/std_conf": 0.0014063237127269169,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9303259141494437,
"calib/step_q_c_n": 1258.0,
"calib/step_q_gap": 0.01974469209281171,
"calib/step_q_w": 0.910581222056632,
"calib/step_q_w_n": 671.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2831.0,
"completions/max_terminated_length": 2831.0,
"completions/mean_length": 826.18359375,
"completions/mean_terminated_length": 835.9802856445312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 428.0,
"epoch": 0.17386666666666667,
"grad_norm": 0.005138279404491186,
"kl": 0.048274993896484375,
"learning_rate": 1.0277777777777777e-06,
"loss": -0.0275,
"num_tokens": 53364370.0,
"reward": 0.8028818368911743,
"reward_std": 0.21207211911678314,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/asymmetric_l2_reward": 0.7033613920211792,
"rewards/final_brier_reward_step": 0.5899022817611694,
"rewards/format_reward_step": 0.98046875,
"step": 163
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.508445945945946,
"calib/avg_num_step_conf": 6.8828125,
"calib/ece": 0.38829268292682917,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0003736900165469681,
"calib/mean_conf": 0.9899186991869918,
"calib/mu_c": 0.9900675675675674,
"calib/mu_w": 0.9896938775510205,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.38829268292682917,
"calib/std_conf": 0.002014554746888435,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9237198929527208,
"calib/step_q_c_n": 1121.0,
"calib/step_q_gap": 0.01459352789811863,
"calib/step_q_w": 0.9091263650546022,
"calib/step_q_w_n": 641.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1631.0,
"completions/max_terminated_length": 1631.0,
"completions/mean_length": 794.96875,
"completions/mean_terminated_length": 820.6128540039062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 544.0,
"epoch": 0.17493333333333333,
"grad_norm": 0.005803633015602827,
"kl": 0.04936981201171875,
"learning_rate": 1.0000000000000002e-06,
"loss": -0.0688,
"num_tokens": 53674018.0,
"reward": 0.7884545922279358,
"reward_std": 0.2159702479839325,
"rewards/accuracy_reward_step": 0.578125,
"rewards/asymmetric_l2_reward": 0.6831825971603394,
"rewards/final_brier_reward_step": 0.5859140157699585,
"rewards/format_reward_step": 0.9609375,
"step": 164
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5034391534391535,
"calib/avg_num_step_conf": 7.53515625,
"calib/ece": 0.4117269076305222,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 6.94444444444553e-05,
"calib/mean_conf": 0.9900401606425704,
"calib/mu_c": 0.9900694444444444,
"calib/mu_w": 0.99,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4117269076305222,
"calib/std_conf": 0.0010969076533130662,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9288812980358668,
"calib/step_q_c_n": 1171.0,
"calib/step_q_gap": 0.010754648959349633,
"calib/step_q_w": 0.9181266490765172,
"calib/step_q_w_n": 758.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2997.0,
"completions/max_terminated_length": 2997.0,
"completions/mean_length": 816.93359375,
"completions/mean_terminated_length": 826.62060546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 505.0,
"epoch": 0.176,
"grad_norm": 0.005772776901721954,
"kl": 0.047878265380859375,
"learning_rate": 9.722222222222224e-07,
"loss": 0.0007,
"num_tokens": 53988729.0,
"reward": 0.7707719802856445,
"reward_std": 0.2285512387752533,
"rewards/accuracy_reward_step": 0.5625,
"rewards/asymmetric_l2_reward": 0.6656252145767212,
"rewards/final_brier_reward_step": 0.570449948310852,
"rewards/format_reward_step": 0.96484375,
"step": 165
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5097144194756553,
"calib/avg_num_step_conf": 7.55078125,
"calib/ece": 0.27800000000000014,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00019506866416951762,
"calib/mean_conf": 0.9900000000000001,
"calib/mu_c": 0.9900561797752808,
"calib/mu_w": 0.9898611111111113,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.27800000000000014,
"calib/std_conf": 0.0008944271909999167,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9282830315224683,
"calib/step_q_c_n": 1491.0,
"calib/step_q_gap": 0.02353190030074892,
"calib/step_q_w": 0.9047511312217194,
"calib/step_q_w_n": 442.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2352.0,
"completions/max_terminated_length": 2352.0,
"completions/mean_length": 811.90234375,
"completions/mean_terminated_length": 824.7897338867188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 505.0,
"epoch": 0.17706666666666668,
"grad_norm": 0.005784960929304361,
"kl": 0.05084228515625,
"learning_rate": 9.444444444444445e-07,
"loss": -0.0096,
"num_tokens": 54302760.0,
"reward": 0.9043484330177307,
"reward_std": 0.19850796461105347,
"rewards/accuracy_reward_step": 0.6953125,
"rewards/asymmetric_l2_reward": 0.7734046578407288,
"rewards/final_brier_reward_step": 0.7009171843528748,
"rewards/format_reward_step": 0.9765625,
"step": 166
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5173226164079823,
"calib/avg_num_step_conf": 7.82421875,
"calib/ece": 0.339202380952381,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0003431263858091338,
"calib/mean_conf": 0.9899960317460318,
"calib/mu_c": 0.9901158536585365,
"calib/mu_w": 0.9897727272727274,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.339202380952381,
"calib/std_conf": 0.0012295889617531574,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.928510082150859,
"calib/step_q_c_n": 1339.0,
"calib/step_q_gap": 0.00506128697013597,
"calib/step_q_w": 0.923448795180723,
"calib/step_q_w_n": 664.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2766.0,
"completions/max_terminated_length": 2766.0,
"completions/mean_length": 806.5859375,
"completions/mean_terminated_length": 816.1502075195312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 476.0,
"epoch": 0.17813333333333334,
"grad_norm": 0.005531142000108957,
"kl": 0.052581787109375,
"learning_rate": 9.166666666666666e-07,
"loss": 0.005,
"num_tokens": 54614854.0,
"reward": 0.8472436666488647,
"reward_std": 0.23025396466255188,
"rewards/accuracy_reward_step": 0.640625,
"rewards/asymmetric_l2_reward": 0.722790002822876,
"rewards/final_brier_reward_step": 0.647478461265564,
"rewards/format_reward_step": 0.98046875,
"step": 167
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 7.0234375,
"calib/ece": 0.2864426877470355,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 2.220446049250313e-16,
"calib/mean_conf": 0.99,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9899999999999998,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2864426877470355,
"calib/std_conf": 0.000889108448948775,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9222430607651912,
"calib/step_q_c_n": 1333.0,
"calib/step_q_gap": 0.022200050012503136,
"calib/step_q_w": 0.9000430107526881,
"calib/step_q_w_n": 465.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1629.0,
"completions/max_terminated_length": 1629.0,
"completions/mean_length": 830.3984375,
"completions/mean_terminated_length": 840.2451171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 524.0,
"epoch": 0.1792,
"grad_norm": 0.006028932053595781,
"kl": 0.04668426513671875,
"learning_rate": 8.88888888888889e-07,
"loss": 0.0018,
"num_tokens": 54932108.0,
"reward": 0.8952298164367676,
"reward_std": 0.20066331326961517,
"rewards/accuracy_reward_step": 0.6953125,
"rewards/asymmetric_l2_reward": 0.752668559551239,
"rewards/final_brier_reward_step": 0.7010722160339355,
"rewards/format_reward_step": 0.98828125,
"step": 168
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 7.91796875,
"calib/ece": 0.3892094861660079,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 1.1102230246251565e-16,
"calib/mean_conf": 0.99,
"calib/mu_c": 0.9900000000000001,
"calib/mu_w": 0.99,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3892094861660079,
"calib/std_conf": 0.000889108448948775,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9278112609040444,
"calib/step_q_c_n": 1261.0,
"calib/step_q_gap": 0.01910368910247795,
"calib/step_q_w": 0.9087075718015665,
"calib/step_q_w_n": 766.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2466.0,
"completions/max_terminated_length": 2466.0,
"completions/mean_length": 823.63671875,
"completions/mean_terminated_length": 830.1220703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 541.0,
"epoch": 0.18026666666666666,
"grad_norm": 0.005781630985438824,
"kl": 0.04717254638671875,
"learning_rate": 8.611111111111112e-07,
"loss": 0.018,
"num_tokens": 55247143.0,
"reward": 0.8189793825149536,
"reward_std": 0.17103320360183716,
"rewards/accuracy_reward_step": 0.59375,
"rewards/asymmetric_l2_reward": 0.7200114727020264,
"rewards/final_brier_reward_step": 0.6015409827232361,
"rewards/format_reward_step": 0.98828125,
"step": 169
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.511520737327189,
"calib/avg_num_step_conf": 7.6484375,
"calib/ece": 0.3561811023622047,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0002317504842052065,
"calib/mean_conf": 0.9900393700787401,
"calib/mu_c": 0.9901242236024846,
"calib/mu_w": 0.9898924731182794,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3561811023622047,
"calib/std_conf": 0.0010860719861522626,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9301465201465203,
"calib/step_q_c_n": 1365.0,
"calib/step_q_gap": 0.014278054716503452,
"calib/step_q_w": 0.9158684654300169,
"calib/step_q_w_n": 593.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2043.0,
"completions/max_terminated_length": 2043.0,
"completions/mean_length": 824.89453125,
"completions/mean_terminated_length": 828.1294555664062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 102.0,
"epoch": 0.18133333333333335,
"grad_norm": 0.14566980302333832,
"kl": 0.09648895263671875,
"learning_rate": 8.333333333333333e-07,
"loss": -0.0184,
"num_tokens": 55562468.0,
"reward": 0.8437641263008118,
"reward_std": 0.2100234031677246,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/asymmetric_l2_reward": 0.7280181050300598,
"rewards/final_brier_reward_step": 0.6360726356506348,
"rewards/format_reward_step": 0.98828125,
"step": 170
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.49199641439364833,
"calib/avg_num_step_conf": 7.859375,
"calib/ece": 0.44418326693227095,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0001607119989756045,
"calib/mean_conf": 0.99,
"calib/mu_c": 0.9899270072992701,
"calib/mu_w": 0.9900877192982457,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.44418326693227095,
"calib/std_conf": 0.0008926436853549044,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9300573300573299,
"calib/step_q_c_n": 1221.0,
"calib/step_q_gap": 0.015556697946078168,
"calib/step_q_w": 0.9145006321112518,
"calib/step_q_w_n": 791.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1726.0,
"completions/max_terminated_length": 1726.0,
"completions/mean_length": 787.0546875,
"completions/mean_terminated_length": 799.5476684570312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 494.0,
"epoch": 0.1824,
"grad_norm": 0.006573774851858616,
"kl": 0.046173095703125,
"learning_rate": 8.055555555555557e-07,
"loss": -0.0112,
"num_tokens": 55870850.0,
"reward": 0.7430707216262817,
"reward_std": 0.2690494954586029,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/asymmetric_l2_reward": 0.6438180208206177,
"rewards/final_brier_reward_step": 0.5399796962738037,
"rewards/format_reward_step": 0.9765625,
"step": 171
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.4972972972972973,
"calib/avg_num_step_conf": 8.41015625,
"calib/ece": 0.2586561264822135,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00016216216216202728,
"calib/mean_conf": 0.9898814229249012,
"calib/mu_c": 0.9898378378378379,
"calib/mu_w": 0.9899999999999999,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2586561264822135,
"calib/std_conf": 0.0018823527114293543,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.933501213592233,
"calib/step_q_c_n": 1648.0,
"calib/step_q_gap": 0.011223985869460718,
"calib/step_q_w": 0.9222772277227723,
"calib/step_q_w_n": 505.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2312.0,
"completions/max_terminated_length": 2312.0,
"completions/mean_length": 815.2265625,
"completions/mean_terminated_length": 821.6456909179688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 562.0,
"epoch": 0.18346666666666667,
"grad_norm": 0.006808638572692871,
"kl": 0.05686187744140625,
"learning_rate": 7.777777777777779e-07,
"loss": -0.0047,
"num_tokens": 56182900.0,
"reward": 0.9279036521911621,
"reward_std": 0.18457263708114624,
"rewards/accuracy_reward_step": 0.72265625,
"rewards/asymmetric_l2_reward": 0.7857556343078613,
"rewards/final_brier_reward_step": 0.7278640270233154,
"rewards/format_reward_step": 0.98828125,
"step": 172
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.49678574037548395,
"calib/avg_num_step_conf": 8.375,
"calib/ece": 0.31408,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -6.428519249035425e-05,
"calib/mean_conf": 0.9900800000000001,
"calib/mu_c": 0.9900591715976331,
"calib/mu_w": 0.9901234567901235,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.31408,
"calib/std_conf": 0.0008908422980528043,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.9305886475122636,
"calib/step_q_c_n": 1427.0,
"calib/step_q_gap": 0.002834114736810167,
"calib/step_q_w": 0.9277545327754534,
"calib/step_q_w_n": 717.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2817.0,
"completions/max_terminated_length": 2817.0,
"completions/mean_length": 838.22265625,
"completions/mean_terminated_length": 844.8228149414062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 457.0,
"epoch": 0.18453333333333333,
"grad_norm": 0.0054668583907186985,
"kl": 0.051910400390625,
"learning_rate": 7.5e-07,
"loss": 0.0027,
"num_tokens": 56500645.0,
"reward": 0.8591402769088745,
"reward_std": 0.2172548919916153,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/asymmetric_l2_reward": 0.7247052192687988,
"rewards/final_brier_reward_step": 0.666231632232666,
"rewards/format_reward_step": 0.97265625,
"step": 173
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.4963296629963297,
"calib/avg_num_step_conf": 7.8203125,
"calib/ece": 0.44117886178861787,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -7.407407407389321e-05,
"calib/mean_conf": 0.989959349593496,
"calib/mu_c": 0.9899259259259259,
"calib/mu_w": 0.9899999999999998,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.44117886178861787,
"calib/std_conf": 0.0011035668260567673,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9252323420074351,
"calib/step_q_c_n": 1076.0,
"calib/step_q_gap": 0.002132989955599296,
"calib/step_q_w": 0.9230993520518358,
"calib/step_q_w_n": 926.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2459.0,
"completions/max_terminated_length": 2459.0,
"completions/mean_length": 890.26953125,
"completions/mean_terminated_length": 904.4008178710938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 549.0,
"epoch": 0.1856,
"grad_norm": 0.00568380206823349,
"kl": 0.0473785400390625,
"learning_rate": 7.222222222222222e-07,
"loss": -0.0077,
"num_tokens": 56832786.0,
"reward": 0.7364821434020996,
"reward_std": 0.29423683881759644,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/asymmetric_l2_reward": 0.6393903493881226,
"rewards/final_brier_reward_step": 0.5359175205230713,
"rewards/format_reward_step": 0.9609375,
"step": 174
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5044642857142857,
"calib/avg_num_step_conf": 7.99609375,
"calib/ece": 0.5420400000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 8.92857142859027e-05,
"calib/mean_conf": 0.99004,
"calib/mu_c": 0.9900892857142859,
"calib/mu_w": 0.99,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.5420400000000001,
"calib/std_conf": 0.0006311893535223806,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.9312725450901804,
"calib/step_q_c_n": 998.0,
"calib/step_q_gap": 0.007669113250332926,
"calib/step_q_w": 0.9236034318398475,
"calib/step_q_w_n": 1049.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2381.0,
"completions/max_terminated_length": 2381.0,
"completions/mean_length": 828.3046875,
"completions/mean_terminated_length": 841.4524536132812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 472.0,
"epoch": 0.18666666666666668,
"grad_norm": 0.00572630763053894,
"kl": 0.0507659912109375,
"learning_rate": 6.944444444444446e-07,
"loss": -0.0266,
"num_tokens": 57150656.0,
"reward": 0.6720933318138123,
"reward_std": 0.26687222719192505,
"rewards/accuracy_reward_step": 0.4375,
"rewards/asymmetric_l2_reward": 0.618736207485199,
"rewards/final_brier_reward_step": 0.444200336933136,
"rewards/format_reward_step": 0.96875,
"step": 175
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5012851731601731,
"calib/avg_num_step_conf": 8.3828125,
"calib/ece": 0.3741200000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 2.5703463203297083e-05,
"calib/mean_conf": 0.9901200000000001,
"calib/mu_c": 0.9901298701298701,
"calib/mu_w": 0.9901041666666668,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3741200000000001,
"calib/std_conf": 0.001088852607105297,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9325920577617329,
"calib/step_q_c_n": 1385.0,
"calib/step_q_gap": 0.009280625435845957,
"calib/step_q_w": 0.923311432325887,
"calib/step_q_w_n": 761.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2884.0,
"completions/max_terminated_length": 2884.0,
"completions/mean_length": 824.0625,
"completions/mean_terminated_length": 833.8340454101562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 522.0,
"epoch": 0.18773333333333334,
"grad_norm": 0.005427682306617498,
"kl": 0.047412872314453125,
"learning_rate": 6.666666666666667e-07,
"loss": -0.0152,
"num_tokens": 57465680.0,
"reward": 0.8262530565261841,
"reward_std": 0.2361697107553482,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/asymmetric_l2_reward": 0.733539342880249,
"rewards/final_brier_reward_step": 0.604904294013977,
"rewards/format_reward_step": 0.96875,
"step": 176
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5104166666666667,
"calib/avg_num_step_conf": 8.49609375,
"calib/ece": 0.36639215686274507,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00020833333333303283,
"calib/mean_conf": 0.989921568627451,
"calib/mu_c": 0.9899999999999998,
"calib/mu_w": 0.9897916666666667,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.36639215686274507,
"calib/std_conf": 0.0008821350493491767,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9363442940038684,
"calib/step_q_c_n": 1551.0,
"calib/step_q_gap": 0.024469294003868436,
"calib/step_q_w": 0.911875,
"calib/step_q_w_n": 624.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1973.0,
"completions/max_terminated_length": 1973.0,
"completions/mean_length": 828.94921875,
"completions/mean_terminated_length": 828.94921875,
"completions/min_length": 529.0,
"completions/min_terminated_length": 529.0,
"epoch": 0.1888,
"grad_norm": 0.006155412178486586,
"kl": 0.05728912353515625,
"learning_rate": 6.388888888888889e-07,
"loss": -0.0037,
"num_tokens": 57781723.0,
"reward": 0.8317885398864746,
"reward_std": 0.20242759585380554,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/asymmetric_l2_reward": 0.7114914655685425,
"rewards/final_brier_reward_step": 0.6286479830741882,
"rewards/format_reward_step": 0.99609375,
"step": 177
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.493632504548211,
"calib/avg_num_step_conf": 8.33984375,
"calib/ece": 0.3778800000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -2.7626170743255685e-05,
"calib/mean_conf": 0.9898800000000001,
"calib/mu_c": 0.9898692810457516,
"calib/mu_w": 0.9898969072164948,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3778800000000001,
"calib/std_conf": 0.0018935680605671416,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9328140013726837,
"calib/step_q_c_n": 1457.0,
"calib/step_q_gap": 0.019878898127845823,
"calib/step_q_w": 0.9129351032448378,
"calib/step_q_w_n": 678.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2758.0,
"completions/max_terminated_length": 2758.0,
"completions/mean_length": 804.12109375,
"completions/mean_terminated_length": 823.4200439453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 535.0,
"epoch": 0.18986666666666666,
"grad_norm": 0.005673910956829786,
"kl": 0.0516510009765625,
"learning_rate": 6.111111111111112e-07,
"loss": -0.032,
"num_tokens": 58093650.0,
"reward": 0.8042181134223938,
"reward_std": 0.26732301712036133,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/asymmetric_l2_reward": 0.6892424821853638,
"rewards/final_brier_reward_step": 0.6051312685012817,
"rewards/format_reward_step": 0.97265625,
"step": 178
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5032679738562091,
"calib/avg_num_step_conf": 8.484375,
"calib/ece": 0.37804000000000004,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 6.535947712427159e-05,
"calib/mean_conf": 0.99004,
"calib/mu_c": 0.9900653594771243,
"calib/mu_w": 0.99,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.37804000000000004,
"calib/std_conf": 0.0006311893535223806,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9356678200692042,
"calib/step_q_c_n": 1445.0,
"calib/step_q_gap": 0.022737971375944244,
"calib/step_q_w": 0.91292984869326,
"calib/step_q_w_n": 727.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2893.0,
"completions/max_terminated_length": 2893.0,
"completions/mean_length": 834.37109375,
"completions/mean_terminated_length": 847.6151123046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 526.0,
"epoch": 0.19093333333333334,
"grad_norm": 0.005824382416903973,
"kl": 0.055938720703125,
"learning_rate": 5.833333333333334e-07,
"loss": -0.0158,
"num_tokens": 58413513.0,
"reward": 0.8095970153808594,
"reward_std": 0.23507192730903625,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/asymmetric_l2_reward": 0.7009310722351074,
"rewards/final_brier_reward_step": 0.6049816012382507,
"rewards/format_reward_step": 0.96875,
"step": 179
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 8.1328125,
"calib/ece": 0.37095238095238103,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -1.1102230246251565e-16,
"calib/mean_conf": 0.9900000000000001,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9900000000000001,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.37095238095238103,
"calib/std_conf": 1.1102230246251565e-16,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9301554770318021,
"calib/step_q_c_n": 1415.0,
"calib/step_q_gap": 0.01328891031516033,
"calib/step_q_w": 0.9168665667166418,
"calib/step_q_w_n": 667.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1975.0,
"completions/max_terminated_length": 1975.0,
"completions/mean_length": 851.38671875,
"completions/mean_terminated_length": 864.9008178710938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 484.0,
"epoch": 0.192,
"grad_norm": 0.00597095163539052,
"kl": 0.0483551025390625,
"learning_rate": 5.555555555555555e-07,
"loss": -0.0073,
"num_tokens": 58735324.0,
"reward": 0.8375976085662842,
"reward_std": 0.21114030480384827,
"rewards/accuracy_reward_step": 0.609375,
"rewards/asymmetric_l2_reward": 0.7396686673164368,
"rewards/final_brier_reward_step": 0.6167765855789185,
"rewards/format_reward_step": 0.984375,
"step": 180
},
{
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.49568965517241376,
"calib/avg_num_step_conf": 8.515625,
"calib/ece": 0.4449411764705882,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -8.620689655181035e-05,
"calib/mean_conf": 0.9900392156862744,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.9900862068965517,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4449411764705882,
"calib/std_conf": 0.0010839431342027658,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9334858044164037,
"calib/step_q_c_n": 1268.0,
"calib/step_q_gap": 0.0100647517848248,
"calib/step_q_w": 0.9234210526315789,
"calib/step_q_w_n": 912.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2102.0,
"completions/max_terminated_length": 2102.0,
"completions/mean_length": 815.78125,
"completions/mean_terminated_length": 818.98046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 491.0,
"epoch": 0.19306666666666666,
"grad_norm": 0.006558713968843222,
"kl": 0.0558013916015625,
"learning_rate": 5.277777777777779e-07,
"loss": 0.0032,
"num_tokens": 59050428.0,
"reward": 0.7611607313156128,
"reward_std": 0.2021130919456482,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/asymmetric_l2_reward": 0.6626558899879456,
"rewards/final_brier_reward_step": 0.5518530607223511,
"rewards/format_reward_step": 0.99609375,
"step": 181
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.49774865591397854,
"calib/avg_num_step_conf": 9.03515625,
"calib/ece": 0.35766798418972334,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -4.502688171992286e-05,
"calib/mean_conf": 0.9900790513833992,
"calib/mu_c": 0.9900625000000002,
"calib/mu_w": 0.9901075268817201,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.35766798418972334,
"calib/std_conf": 0.0008855872135339169,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9366816431322209,
"calib/step_q_c_n": 1558.0,
"calib/step_q_gap": 0.014006146443479195,
"calib/step_q_w": 0.9226754966887417,
"calib/step_q_w_n": 755.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1666.0,
"completions/max_terminated_length": 1666.0,
"completions/mean_length": 819.6796875,
"completions/mean_terminated_length": 829.3992309570312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 575.0,
"epoch": 0.19413333333333332,
"grad_norm": 0.0055109853856265545,
"kl": 0.0526275634765625,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0121,
"num_tokens": 59366426.0,
"reward": 0.818576455116272,
"reward_std": 0.22150349617004395,
"rewards/accuracy_reward_step": 0.625,
"rewards/asymmetric_l2_reward": 0.6832661628723145,
"rewards/final_brier_reward_step": 0.6320116519927979,
"rewards/format_reward_step": 0.984375,
"step": 182
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5013616557734205,
"calib/avg_num_step_conf": 8.35546875,
"calib/ece": 0.3756626506024098,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 2.655228758163819e-05,
"calib/mean_conf": 0.990120481927711,
"calib/mu_c": 0.9901307189542484,
"calib/mu_w": 0.9901041666666668,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3756626506024098,
"calib/std_conf": 0.001411919343875258,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9352155771905425,
"calib/step_q_c_n": 1438.0,
"calib/step_q_gap": 0.019224136391683855,
"calib/step_q_w": 0.9159914407988586,
"calib/step_q_w_n": 701.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2902.0,
"completions/max_terminated_length": 2902.0,
"completions/mean_length": 866.56640625,
"completions/mean_terminated_length": 880.3214721679688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 557.0,
"epoch": 0.1952,
"grad_norm": 0.00530533492565155,
"kl": 0.05149078369140625,
"learning_rate": 4.7222222222222226e-07,
"loss": -0.014,
"num_tokens": 59694947.0,
"reward": 0.8005015850067139,
"reward_std": 0.2967371344566345,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/asymmetric_l2_reward": 0.6819595098495483,
"rewards/final_brier_reward_step": 0.6049812436103821,
"rewards/format_reward_step": 0.97265625,
"step": 183
},
{
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.503170664460987,
"calib/avg_num_step_conf": 8.55078125,
"calib/ece": 0.3635341365461848,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 6.41025641027193e-05,
"calib/mean_conf": 0.9900401606425704,
"calib/mu_c": 0.9900641025641025,
"calib/mu_w": 0.9899999999999998,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3635341365461848,
"calib/std_conf": 0.0010969076533130662,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9356889204545457,
"calib/step_q_c_n": 1408.0,
"calib/step_q_gap": 0.009888664372599498,
"calib/step_q_w": 0.9258002560819462,
"calib/step_q_w_n": 781.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 1398.0,
"completions/max_terminated_length": 1398.0,
"completions/mean_length": 827.08203125,
"completions/mean_terminated_length": 850.3333129882812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 585.0,
"epoch": 0.19626666666666667,
"grad_norm": 0.005826389882713556,
"kl": 0.0535430908203125,
"learning_rate": 4.444444444444445e-07,
"loss": -0.0479,
"num_tokens": 60011960.0,
"reward": 0.821940541267395,
"reward_std": 0.22585231065750122,
"rewards/accuracy_reward_step": 0.609375,
"rewards/asymmetric_l2_reward": 0.7109317779541016,
"rewards/final_brier_reward_step": 0.6165429353713989,
"rewards/format_reward_step": 0.97265625,
"step": 184
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5150070126227209,
"calib/avg_num_step_conf": 8.953125,
"calib/ece": 0.36255060728744926,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0003022440392704828,
"calib/mean_conf": 0.9900809716599189,
"calib/mu_c": 0.9901935483870965,
"calib/mu_w": 0.989891304347826,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.36255060728744926,
"calib/std_conf": 0.001269990861648432,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.9383408360128617,
"calib/step_q_c_n": 1555.0,
"calib/step_q_gap": 0.01623771525302442,
"calib/step_q_w": 0.9221031207598372,
"calib/step_q_w_n": 737.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2964.0,
"completions/max_terminated_length": 2964.0,
"completions/mean_length": 853.453125,
"completions/mean_terminated_length": 870.4542236328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 84.0,
"epoch": 0.19733333333333333,
"grad_norm": 9127.7880859375,
"kl": 41984.04993057251,
"learning_rate": 4.1666666666666667e-07,
"loss": 419.4765,
"num_tokens": 60337364.0,
"reward": 0.7966436147689819,
"reward_std": 0.2328513264656067,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/asymmetric_l2_reward": 0.6721329689025879,
"rewards/final_brier_reward_step": 0.608654260635376,
"rewards/format_reward_step": 0.95703125,
"step": 185
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5034246575342466,
"calib/avg_num_step_conf": 8.96875,
"calib/ece": 0.4083665338645418,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 6.849315068480077e-05,
"calib/mean_conf": 0.9900398406374502,
"calib/mu_c": 0.9900684931506848,
"calib/mu_w": 0.99,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4083665338645418,
"calib/std_conf": 0.0006299357888781636,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.938288770053476,
"calib/step_q_c_n": 1496.0,
"calib/step_q_gap": 0.015901270053475858,
"calib/step_q_w": 0.9223875000000001,
"calib/step_q_w_n": 800.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 1551.0,
"completions/max_terminated_length": 1551.0,
"completions/mean_length": 824.5703125,
"completions/mean_terminated_length": 840.9960327148438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 501.0,
"epoch": 0.1984,
"grad_norm": 0.005916334688663483,
"kl": 0.0541534423828125,
"learning_rate": 3.8888888888888895e-07,
"loss": -0.0308,
"num_tokens": 60653494.0,
"reward": 0.7912920713424683,
"reward_std": 0.23391059041023254,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/asymmetric_l2_reward": 0.6940099000930786,
"rewards/final_brier_reward_step": 0.578417956829071,
"rewards/format_reward_step": 0.98046875,
"step": 186
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5054662802950475,
"calib/avg_num_step_conf": 8.98046875,
"calib/ece": 0.4061600000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00010932560590093399,
"calib/mean_conf": 0.99016,
"calib/mu_c": 0.9902054794520547,
"calib/mu_w": 0.9900961538461538,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4061600000000001,
"calib/std_conf": 0.001254750971308651,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9334121122599706,
"calib/step_q_c_n": 1354.0,
"calib/step_q_gap": 0.010163435011293243,
"calib/step_q_w": 0.9232486772486773,
"calib/step_q_w_n": 945.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1979.0,
"completions/max_terminated_length": 1979.0,
"completions/mean_length": 862.390625,
"completions/mean_terminated_length": 872.6166381835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.19946666666666665,
"grad_norm": 0.005608111619949341,
"kl": 0.0528106689453125,
"learning_rate": 3.611111111111111e-07,
"loss": -0.0251,
"num_tokens": 60975810.0,
"reward": 0.7882611751556396,
"reward_std": 0.30219972133636475,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/asymmetric_l2_reward": 0.6888840198516846,
"rewards/final_brier_reward_step": 0.5782632827758789,
"rewards/format_reward_step": 0.9765625,
"step": 187
},
{
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5011998870694523,
"calib/avg_num_step_conf": 8.52734375,
"calib/ece": 0.3641016260162603,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 3.204404291368146e-05,
"calib/mean_conf": 0.9901178861788619,
"calib/mu_c": 0.9901298701298701,
"calib/mu_w": 0.9900978260869564,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3641016260162603,
"calib/std_conf": 0.001393336786882087,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9348505906879777,
"calib/step_q_c_n": 1439.0,
"calib/step_q_gap": 0.0140360745589454,
"calib/step_q_w": 0.9208145161290323,
"calib/step_q_w_n": 744.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2176.0,
"completions/max_terminated_length": 2176.0,
"completions/mean_length": 859.84765625,
"completions/mean_terminated_length": 876.9761352539062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 463.0,
"epoch": 0.20053333333333334,
"grad_norm": 0.0053206272423267365,
"kl": 0.05606842041015625,
"learning_rate": 3.3333333333333335e-07,
"loss": -0.0046,
"num_tokens": 61300003.0,
"reward": 0.7975469827651978,
"reward_std": 0.22008016705513,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/asymmetric_l2_reward": 0.6786959767341614,
"rewards/final_brier_reward_step": 0.6046792268753052,
"rewards/format_reward_step": 0.95703125,
"step": 188
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 8.59765625,
"calib/ece": 0.42027888446215134,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 1.1102230246251565e-16,
"calib/mean_conf": 0.99,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9899999999999999,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.42027888446215134,
"calib/std_conf": 0.0008926436853549044,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9347464042392128,
"calib/step_q_c_n": 1321.0,
"calib/step_q_gap": 0.00949640423921283,
"calib/step_q_w": 0.92525,
"calib/step_q_w_n": 880.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2781.0,
"completions/max_terminated_length": 2781.0,
"completions/mean_length": 836.6640625,
"completions/mean_terminated_length": 846.5850219726562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 520.0,
"epoch": 0.2016,
"grad_norm": 0.005423042923212051,
"kl": 0.05489349365234375,
"learning_rate": 3.055555555555556e-07,
"loss": 0.0028,
"num_tokens": 61621957.0,
"reward": 0.7671196460723877,
"reward_std": 0.25127676129341125,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/asymmetric_l2_reward": 0.6641814112663269,
"rewards/final_brier_reward_step": 0.5630265474319458,
"rewards/format_reward_step": 0.9765625,
"step": 189
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5020397756246813,
"calib/avg_num_step_conf": 8.48828125,
"calib/ece": 0.40744094488188976,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 4.079551249336344e-05,
"calib/mean_conf": 0.9901181102362204,
"calib/mu_c": 0.9901351351351348,
"calib/mu_w": 0.9900943396226415,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.40744094488188976,
"calib/std_conf": 0.0010803482467726185,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.934004491017964,
"calib/step_q_c_n": 1336.0,
"calib/step_q_gap": 0.012140691734809916,
"calib/step_q_w": 0.9218637992831541,
"calib/step_q_w_n": 837.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2351.0,
"completions/max_terminated_length": 2351.0,
"completions/mean_length": 866.859375,
"completions/mean_terminated_length": 873.68505859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 502.0,
"epoch": 0.20266666666666666,
"grad_norm": 0.005208548624068499,
"kl": 0.05187225341796875,
"learning_rate": 2.7777777777777776e-07,
"loss": -0.025,
"num_tokens": 61949481.0,
"reward": 0.7925498485565186,
"reward_std": 0.19229495525360107,
"rewards/accuracy_reward_step": 0.578125,
"rewards/asymmetric_l2_reward": 0.6856661438941956,
"rewards/final_brier_reward_step": 0.5861523151397705,
"rewards/format_reward_step": 0.98828125,
"step": 190
},
{
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5195272353545735,
"calib/avg_num_step_conf": 9.359375,
"calib/ece": 0.4362509960159364,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0004033273381293112,
"calib/mean_conf": 0.9900358565737053,
"calib/mu_c": 0.9902158273381294,
"calib/mu_w": 0.9898125000000001,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4362509960159364,
"calib/std_conf": 0.001646773122345849,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9380392156862745,
"calib/step_q_c_n": 1428.0,
"calib/step_q_gap": 0.010193141306109044,
"calib/step_q_w": 0.9278460743801654,
"calib/step_q_w_n": 968.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2820.0,
"completions/max_terminated_length": 2820.0,
"completions/mean_length": 833.29296875,
"completions/mean_terminated_length": 846.5198974609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 561.0,
"epoch": 0.20373333333333332,
"grad_norm": 0.005339369177818298,
"kl": 0.0585479736328125,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.026,
"num_tokens": 62266972.0,
"reward": 0.746340811252594,
"reward_std": 0.15258918702602386,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/asymmetric_l2_reward": 0.6362113356590271,
"rewards/final_brier_reward_step": 0.5517827868461609,
"rewards/format_reward_step": 0.98046875,
"step": 191
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5064236838430386,
"calib/avg_num_step_conf": 8.4140625,
"calib/ece": 0.3665991902834007,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0001298701298703131,
"calib/mean_conf": 0.9900809716599189,
"calib/mu_c": 0.9901298701298701,
"calib/mu_w": 0.9899999999999998,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3665991902834007,
"calib/std_conf": 0.0012699908616484322,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.936250852079073,
"calib/step_q_c_n": 1467.0,
"calib/step_q_gap": 0.01818680549974272,
"calib/step_q_w": 0.9180640465793303,
"calib/step_q_w_n": 687.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2709.0,
"completions/max_terminated_length": 2709.0,
"completions/mean_length": 837.3828125,
"completions/mean_terminated_length": 864.3951416015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 270.0,
"epoch": 0.2048,
"grad_norm": 0.0058121588081121445,
"kl": 0.05686187744140625,
"learning_rate": 2.2222222222222224e-07,
"loss": -0.046,
"num_tokens": 62586318.0,
"reward": 0.7954423427581787,
"reward_std": 0.2760905623435974,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/asymmetric_l2_reward": 0.6782459616661072,
"rewards/final_brier_reward_step": 0.6009199023246765,
"rewards/format_reward_step": 0.95703125,
"step": 192
},
{
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.49137931034482757,
"calib/avg_num_step_conf": 8.49609375,
"calib/ece": 0.4485770750988143,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00017241379310328764,
"calib/mean_conf": 0.9900790513833992,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9901724137931033,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4485770750988143,
"calib/std_conf": 0.000885587213533917,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9332434640522875,
"calib/step_q_c_n": 1224.0,
"calib/step_q_gap": 0.005882791076472493,
"calib/step_q_w": 0.927360672975815,
"calib/step_q_w_n": 951.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1863.0,
"completions/max_terminated_length": 1863.0,
"completions/mean_length": 850.6171875,
"completions/mean_terminated_length": 860.70361328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 501.0,
"epoch": 0.20586666666666667,
"grad_norm": 0.005852544214576483,
"kl": 0.05562591552734375,
"learning_rate": 1.9444444444444447e-07,
"loss": 0.0044,
"num_tokens": 62909788.0,
"reward": 0.7509139776229858,
"reward_std": 0.2576588988304138,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/asymmetric_l2_reward": 0.6531758904457092,
"rewards/final_brier_reward_step": 0.5439644455909729,
"rewards/format_reward_step": 0.98828125,
"step": 193
},
{
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5127151799687011,
"calib/avg_num_step_conf": 8.734375,
"calib/ece": 0.4219600000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0002556077203963447,
"calib/mean_conf": 0.9899600000000001,
"calib/mu_c": 0.990070422535211,
"calib/mu_w": 0.9898148148148147,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4219600000000001,
"calib/std_conf": 0.0010947145746723218,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9353106508875739,
"calib/step_q_c_n": 1352.0,
"calib/step_q_gap": 0.016475809258614693,
"calib/step_q_w": 0.9188348416289592,
"calib/step_q_w_n": 884.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2210.0,
"completions/max_terminated_length": 2210.0,
"completions/mean_length": 812.16015625,
"completions/mean_terminated_length": 828.3386840820312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 492.0,
"epoch": 0.20693333333333333,
"grad_norm": 0.00551479822024703,
"kl": 0.05739593505859375,
"learning_rate": 1.6666666666666668e-07,
"loss": -0.0216,
"num_tokens": 63223645.0,
"reward": 0.7771685123443604,
"reward_std": 0.19046615064144135,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/asymmetric_l2_reward": 0.6849054098129272,
"rewards/final_brier_reward_step": 0.5631815791130066,
"rewards/format_reward_step": 0.9765625,
"step": 194
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.49879275653923544,
"calib/avg_num_step_conf": 8.5390625,
"calib/ece": 0.41517813765182177,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -1.5291750503299006e-05,
"calib/mean_conf": 0.990076923076923,
"calib/mu_c": 0.990070422535211,
"calib/mu_w": 0.9900857142857143,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.41517813765182177,
"calib/std_conf": 0.0008525704443058614,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9347727272727274,
"calib/step_q_c_n": 1364.0,
"calib/step_q_gap": 0.016338420703384426,
"calib/step_q_w": 0.918434306569343,
"calib/step_q_w_n": 822.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2997.0,
"completions/max_terminated_length": 2997.0,
"completions/mean_length": 833.63671875,
"completions/mean_terminated_length": 850.2430419921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 561.0,
"epoch": 0.208,
"grad_norm": 0.005558379925787449,
"kl": 0.05661773681640625,
"learning_rate": 1.3888888888888888e-07,
"loss": -0.0177,
"num_tokens": 63543040.0,
"reward": 0.7495644688606262,
"reward_std": 0.2524225115776062,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/asymmetric_l2_reward": 0.633357048034668,
"rewards/final_brier_reward_step": 0.5626468658447266,
"rewards/format_reward_step": 0.9609375,
"step": 195
},
{
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5098039215686274,
"calib/avg_num_step_conf": 9.87890625,
"calib/ece": 0.38537549407114624,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00019607843137270375,
"calib/mean_conf": 0.9901185770750988,
"calib/mu_c": 0.9901960784313726,
"calib/mu_w": 0.9899999999999999,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.38537549407114624,
"calib/std_conf": 0.001082455647243412,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9403726708074536,
"calib/step_q_c_n": 1610.0,
"calib/step_q_gap": 0.008109341101251166,
"calib/step_q_w": 0.9322633297062024,
"calib/step_q_w_n": 919.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1310.0,
"completions/max_terminated_length": 1310.0,
"completions/mean_length": 776.609375,
"completions/mean_terminated_length": 782.7244262695312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 432.0,
"epoch": 0.20906666666666668,
"grad_norm": 0.005445533432066441,
"kl": 0.0676422119140625,
"learning_rate": 1.1111111111111112e-07,
"loss": -0.0158,
"num_tokens": 63844396.0,
"reward": 0.8032974004745483,
"reward_std": 0.16581255197525024,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/asymmetric_l2_reward": 0.6832548379898071,
"rewards/final_brier_reward_step": 0.6053711175918579,
"rewards/format_reward_step": 0.98828125,
"step": 196
},
{
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.4868421052631579,
"calib/avg_num_step_conf": 8.44140625,
"calib/ece": 0.45165991902834,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00026315789473696505,
"calib/mean_conf": 0.9901214574898785,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.9902631578947368,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.45165991902834,
"calib/std_conf": 0.0010953643124266147,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9346570972886762,
"calib/step_q_c_n": 1254.0,
"calib/step_q_gap": 0.013863271489337636,
"calib/step_q_w": 0.9207938257993386,
"calib/step_q_w_n": 907.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2790.0,
"completions/max_terminated_length": 2790.0,
"completions/mean_length": 829.27734375,
"completions/mean_terminated_length": 852.59033203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 539.0,
"epoch": 0.21013333333333334,
"grad_norm": 0.005342363379895687,
"kl": 0.056148529052734375,
"learning_rate": 8.333333333333334e-08,
"loss": -0.0289,
"num_tokens": 64161747.0,
"reward": 0.7426227331161499,
"reward_std": 0.24280357360839844,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/asymmetric_l2_reward": 0.6602627038955688,
"rewards/final_brier_reward_step": 0.528107762336731,
"rewards/format_reward_step": 0.96484375,
"step": 197
},
{
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5008438818565402,
"calib/avg_num_step_conf": 9.2109375,
"calib/ece": 0.3530080645161291,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 5.991561181417637e-05,
"calib/mean_conf": 0.9901048387096775,
"calib/mu_c": 0.9901265822784808,
"calib/mu_w": 0.9900666666666667,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3530080645161291,
"calib/std_conf": 0.0009698565606206837,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9380546923555003,
"calib/step_q_c_n": 1609.0,
"calib/step_q_gap": 0.014708897962976919,
"calib/step_q_w": 0.9233457943925234,
"calib/step_q_w_n": 749.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 3010.0,
"completions/max_terminated_length": 3010.0,
"completions/mean_length": 821.0703125,
"completions/mean_terminated_length": 844.152587890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 505.0,
"epoch": 0.2112,
"grad_norm": 0.005194542929530144,
"kl": 0.06021881103515625,
"learning_rate": 5.555555555555556e-08,
"loss": -0.0094,
"num_tokens": 64477325.0,
"reward": 0.8296013474464417,
"reward_std": 0.24484550952911377,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/asymmetric_l2_reward": 0.7179390788078308,
"rewards/final_brier_reward_step": 0.6240760684013367,
"rewards/format_reward_step": 0.96875,
"step": 198
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.49473684210526314,
"calib/avg_num_step_conf": 8.80859375,
"calib/ece": 0.36553359683794473,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00010526315789449736,
"calib/mean_conf": 0.9900395256916996,
"calib/mu_c": 0.9899999999999998,
"calib/mu_w": 0.9901052631578943,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.36553359683794473,
"calib/std_conf": 0.0006274509038097849,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9359807692307693,
"calib/step_q_c_n": 1560.0,
"calib/step_q_gap": 0.014700193691201013,
"calib/step_q_w": 0.9212805755395683,
"calib/step_q_w_n": 695.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2158.0,
"completions/max_terminated_length": 2158.0,
"completions/mean_length": 862.57421875,
"completions/mean_terminated_length": 869.3661499023438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 416.0,
"epoch": 0.21226666666666666,
"grad_norm": 0.005453071091324091,
"kl": 0.05690765380859375,
"learning_rate": 2.777777777777778e-08,
"loss": -0.0268,
"num_tokens": 64802344.0,
"reward": 0.8347955942153931,
"reward_std": 0.24365174770355225,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/asymmetric_l2_reward": 0.7249236106872559,
"rewards/final_brier_reward_step": 0.6243550181388855,
"rewards/format_reward_step": 0.984375,
"step": 199
},
{
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5115928831605492,
"calib/avg_num_step_conf": 8.40625,
"calib/ece": 0.33111111111111124,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.000465116279070088,
"calib/mean_conf": 0.98984126984127,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.9895348837209298,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.33111111111111124,
"calib/std_conf": 0.0021763982858416025,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9363337547408344,
"calib/step_q_c_n": 1582.0,
"calib/step_q_gap": 0.02156182491627301,
"calib/step_q_w": 0.9147719298245613,
"calib/step_q_w_n": 570.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1801.0,
"completions/max_terminated_length": 1801.0,
"completions/mean_length": 853.05859375,
"completions/mean_terminated_length": 859.7755737304688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 422.0,
"epoch": 0.21333333333333335,
"grad_norm": 0.005459709092974663,
"kl": 0.05413818359375,
"learning_rate": 0.0,
"loss": -0.0148,
"num_tokens": 65128775.0,
"reward": 0.8560566306114197,
"reward_std": 0.16525955498218536,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/asymmetric_l2_reward": 0.7319831848144531,
"rewards/final_brier_reward_step": 0.6551300287246704,
"rewards/format_reward_step": 0.9765625,
"step": 200
},
{
"epoch": 0.21333333333333335,
"step": 200,
"total_flos": 0.0,
"train_loss": 2.0760135896515566,
"train_runtime": 19368.0064,
"train_samples_per_second": 2.644,
"train_steps_per_second": 0.01
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 65128775,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}