Files
PureRL-1.5B-v7-s2-l2-maskon…/trainer_state.json
ModelHub XC 4a4dc18f82 初始化项目,由ModelHub XC社区提供模型
Model: zhaohq/PureRL-1.5B-v7-s2-l2-maskon-fixed
Source: Original Platform
2026-06-04 16:32:37 +08:00

12243 lines
503 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21333333333333335,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"adv/mean_abs_final_conf": 0.773959219455719,
"adv/mean_abs_reasoning": 0.47714588046073914,
"adv/mean_abs_step_conf": 0.7489925622940063,
"adv/ratio_final_to_reasoning": 1.622059942565935,
"adv/ratio_step_to_reasoning": 1.5697349447317201,
"adv/std_final_conf": 0.9294352531433105,
"adv/std_reasoning": 0.7393431663513184,
"adv/std_step_conf": 0.9343287348747253,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.38076182006817844,
"calib/avg_num_step_conf": 5.23046875,
"calib/ece": 0.2003187250996017,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.2948207171314741,
"calib/gap": -0.026059730250481805,
"calib/mean_conf": 0.8737051792828686,
"calib/mu_c": 0.865606936416185,
"calib/mu_w": 0.8916666666666668,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19239043824701207,
"calib/std_conf": 0.09027744273295583,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7959393232205367,
"calib/step_q_c_n": 857.0,
"calib/step_q_gap": -0.006446568895645877,
"calib/step_q_w": 0.8023858921161826,
"calib/step_q_w_n": 482.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2492.0,
"completions/max_terminated_length": 2492.0,
"completions/mean_length": 474.94921875,
"completions/mean_terminated_length": 478.68896484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.0010666666666666667,
"grad_norm": 0.04304001107811928,
"kl": 0.000291675329208374,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.0135,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03466901555657387,
"mask/share_reasoning": 0.8340686559677124,
"mask/share_step_conf": 0.12344987690448761,
"num_tokens": 229171.0,
"reward": 0.8933797478675842,
"reward_std": 0.19672280550003052,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7142800688743591,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7420106530189514,
"step": 1
},
{
"adv/mean_abs_final_conf": 0.7672724723815918,
"adv/mean_abs_reasoning": 0.5104547739028931,
"adv/mean_abs_step_conf": 0.7698422074317932,
"adv/ratio_final_to_reasoning": 1.503115479781084,
"adv/ratio_step_to_reasoning": 1.5081496868873343,
"adv/std_final_conf": 0.9330522418022156,
"adv/std_reasoning": 0.7575037479400635,
"adv/std_step_conf": 0.9345327615737915,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.44343065693430656,
"calib/avg_num_step_conf": 5.05859375,
"calib/ece": 0.3349411764705883,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.2823529411764706,
"calib/gap": 0.002352468143016151,
"calib/mean_conf": 0.8721960784313726,
"calib/mu_c": 0.8732846715328467,
"calib/mu_w": 0.8709322033898306,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3349411764705883,
"calib/std_conf": 0.07627016470309335,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7954391371340525,
"calib/step_q_c_n": 649.0,
"calib/step_q_gap": 0.011011892552009073,
"calib/step_q_w": 0.7844272445820434,
"calib/step_q_w_n": 646.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1966.0,
"completions/max_terminated_length": 1966.0,
"completions/mean_length": 492.9765625,
"completions/mean_terminated_length": 494.9098205566406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.0021333333333333334,
"grad_norm": 0.04044223949313164,
"kl": 0.00037539005279541016,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0157,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03364308178424835,
"mask/share_reasoning": 0.8523939251899719,
"mask/share_step_conf": 0.11005672812461853,
"num_tokens": 458661.0,
"reward": 0.833743691444397,
"reward_std": 0.19285300374031067,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6320762038230896,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.7291612029075623,
"step": 2
},
{
"adv/mean_abs_final_conf": 0.7681164145469666,
"adv/mean_abs_reasoning": 0.480376660823822,
"adv/mean_abs_step_conf": 0.7542245388031006,
"adv/ratio_final_to_reasoning": 1.598987788519295,
"adv/ratio_step_to_reasoning": 1.5700690735258518,
"adv/std_final_conf": 0.9304441809654236,
"adv/std_reasoning": 0.7392795085906982,
"adv/std_step_conf": 0.9335688948631287,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.49758064516129036,
"calib/avg_num_step_conf": 4.91796875,
"calib/ece": 0.2540316205533596,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.3557312252964427,
"calib/gap": 0.001176075268817356,
"calib/mean_conf": 0.8864426877470355,
"calib/mu_c": 0.8868750000000001,
"calib/mu_w": 0.8856989247311827,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2540316205533596,
"calib/std_conf": 0.04630191430886356,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.8032205683355886,
"calib/step_q_c_n": 739.0,
"calib/step_q_gap": 0.05218210679712709,
"calib/step_q_w": 0.7510384615384615,
"calib/step_q_w_n": 520.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2141.0,
"completions/max_terminated_length": 2141.0,
"completions/mean_length": 498.859375,
"completions/mean_terminated_length": 500.8157043457031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.0032,
"grad_norm": 0.06902016699314117,
"kl": 0.0011424124240875244,
"learning_rate": 7.5e-07,
"loss": 0.0011,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03277520090341568,
"mask/share_reasoning": 0.8543053865432739,
"mask/share_step_conf": 0.10901317000389099,
"num_tokens": 691625.0,
"reward": 0.8797547817230225,
"reward_std": 0.19141316413879395,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.6923027038574219,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7445505857467651,
"step": 3
},
{
"adv/mean_abs_final_conf": 0.759086549282074,
"adv/mean_abs_reasoning": 0.3866489827632904,
"adv/mean_abs_step_conf": 0.7315422296524048,
"adv/ratio_final_to_reasoning": 1.9632446563212422,
"adv/ratio_step_to_reasoning": 1.8920060888929346,
"adv/std_final_conf": 0.9280747771263123,
"adv/std_reasoning": 0.6815574169158936,
"adv/std_step_conf": 0.9344615340232849,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5238012749445676,
"calib/avg_num_step_conf": 5.328125,
"calib/ece": 0.23059523809523813,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.28174603174603174,
"calib/gap": 0.006848669623060077,
"calib/mean_conf": 0.8813888888888889,
"calib/mu_c": 0.8837804878048782,
"calib/mu_w": 0.8769318181818181,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.23059523809523813,
"calib/std_conf": 0.04538258392998638,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.8031084337349397,
"calib/step_q_c_n": 830.0,
"calib/step_q_gap": 0.02587996931546399,
"calib/step_q_w": 0.7772284644194757,
"calib/step_q_w_n": 534.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2736.0,
"completions/max_terminated_length": 2736.0,
"completions/mean_length": 503.65234375,
"completions/mean_terminated_length": 505.6274719238281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.004266666666666667,
"grad_norm": 0.06372291594743729,
"kl": 0.00030165910720825195,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0234,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03290240466594696,
"mask/share_reasoning": 0.8445039987564087,
"mask/share_step_conf": 0.11868731677532196,
"num_tokens": 926728.0,
"reward": 0.8805626630783081,
"reward_std": 0.16167013347148895,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.7082082033157349,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7286983132362366,
"step": 4
},
{
"adv/mean_abs_final_conf": 0.7936871647834778,
"adv/mean_abs_reasoning": 0.4292134642601013,
"adv/mean_abs_step_conf": 0.7634122967720032,
"adv/ratio_final_to_reasoning": 1.8491665124058343,
"adv/ratio_step_to_reasoning": 1.7786308220503049,
"adv/std_final_conf": 0.9305525422096252,
"adv/std_reasoning": 0.6816219091415405,
"adv/std_step_conf": 0.9333208203315735,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.49735567732891545,
"calib/avg_num_step_conf": 4.73046875,
"calib/ece": 0.35171314741035864,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.2788844621513944,
"calib/gap": -3.12221231045795e-05,
"calib/mean_conf": 0.8815936254980079,
"calib/mu_c": 0.881578947368421,
"calib/mu_w": 0.8816101694915256,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.35171314741035864,
"calib/std_conf": 0.04364665025964348,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.797311669128508,
"calib/step_q_c_n": 677.0,
"calib/step_q_gap": 0.0173303957202684,
"calib/step_q_w": 0.7799812734082396,
"calib/step_q_w_n": 534.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2453.0,
"completions/max_terminated_length": 2453.0,
"completions/mean_length": 510.68359375,
"completions/mean_terminated_length": 512.686279296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 177.0,
"epoch": 0.005333333333333333,
"grad_norm": 0.04050876572728157,
"kl": 0.00031438469886779785,
"learning_rate": 1.25e-06,
"loss": -0.0514,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03355187550187111,
"mask/share_reasoning": 0.8516452312469482,
"mask/share_step_conf": 0.11089661717414856,
"num_tokens": 1164151.0,
"reward": 0.795979380607605,
"reward_std": 0.16130538284778595,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.6110925674438477,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.6824285984039307,
"step": 5
},
{
"adv/mean_abs_final_conf": 0.7439464926719666,
"adv/mean_abs_reasoning": 0.3911336064338684,
"adv/mean_abs_step_conf": 0.7478975057601929,
"adv/ratio_final_to_reasoning": 1.9020265209498193,
"adv/ratio_step_to_reasoning": 1.9121279620513634,
"adv/std_final_conf": 0.9310183525085449,
"adv/std_reasoning": 0.7013688683509827,
"adv/std_step_conf": 0.9341275095939636,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.4631035825545171,
"calib/avg_num_step_conf": 5.296875,
"calib/ece": 0.3064143426294821,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.2589641434262948,
"calib/gap": -0.003848650051921032,
"calib/mean_conf": 0.8784462151394421,
"calib/mu_c": 0.8768055555555555,
"calib/mu_w": 0.8806542056074765,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.30557768924302786,
"calib/std_conf": 0.039210029202454075,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7956066945606696,
"calib/step_q_c_n": 717.0,
"calib/step_q_gap": 0.00964425324611562,
"calib/step_q_w": 0.785962441314554,
"calib/step_q_w_n": 639.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2596.0,
"completions/max_terminated_length": 2596.0,
"completions/mean_length": 443.25390625,
"completions/mean_terminated_length": 446.74407958984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 176.0,
"epoch": 0.0064,
"grad_norm": 0.04645095393061638,
"kl": 0.0004774928092956543,
"learning_rate": 1.5e-06,
"loss": -0.0451,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.0370139479637146,
"mask/share_reasoning": 0.8299905061721802,
"mask/share_step_conf": 0.12518304586410522,
"num_tokens": 1383576.0,
"reward": 0.8166599273681641,
"reward_std": 0.18238767981529236,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.6414105892181396,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.6856592297554016,
"step": 6
},
{
"adv/mean_abs_final_conf": 0.7877150774002075,
"adv/mean_abs_reasoning": 0.4653007388114929,
"adv/mean_abs_step_conf": 0.7472065091133118,
"adv/ratio_final_to_reasoning": 1.692916025476878,
"adv/ratio_step_to_reasoning": 1.6058571302119236,
"adv/std_final_conf": 0.9305210709571838,
"adv/std_reasoning": 0.7206236720085144,
"adv/std_step_conf": 0.9340626001358032,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5086842105263157,
"calib/avg_num_step_conf": 5.67578125,
"calib/ece": 0.26050980392156864,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.3215686274509804,
"calib/gap": 0.0001052631578947194,
"calib/mean_conf": 0.8839607843137255,
"calib/mu_c": 0.884,
"calib/mu_w": 0.8838947368421053,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.25850980392156864,
"calib/std_conf": 0.04581718180338529,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7908867521367521,
"calib/step_q_c_n": 936.0,
"calib/step_q_gap": -6.1023491874534486e-05,
"calib/step_q_w": 0.7909477756286266,
"calib/step_q_w_n": 517.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1445.0,
"completions/max_terminated_length": 1445.0,
"completions/mean_length": 541.5078125,
"completions/mean_terminated_length": 543.6314086914062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 177.0,
"epoch": 0.007466666666666667,
"grad_norm": 0.05814244598150253,
"kl": 0.0003116726875305176,
"learning_rate": 1.75e-06,
"loss": 0.0009,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.030131680890917778,
"mask/share_reasoning": 0.8535774946212769,
"mask/share_step_conf": 0.11238458752632141,
"num_tokens": 1629626.0,
"reward": 0.8802685737609863,
"reward_std": 0.18626058101654053,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.695068359375,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.7420310974121094,
"step": 7
},
{
"adv/mean_abs_final_conf": 0.7732793092727661,
"adv/mean_abs_reasoning": 0.37526705861091614,
"adv/mean_abs_step_conf": 0.7797518372535706,
"adv/ratio_final_to_reasoning": 2.0606106811909553,
"adv/ratio_step_to_reasoning": 2.0778584726831344,
"adv/std_final_conf": 0.9287807941436768,
"adv/std_reasoning": 0.6402899026870728,
"adv/std_step_conf": 0.9342546463012695,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.4633838383838384,
"calib/avg_num_step_conf": 4.734375,
"calib/ece": 0.3051181102362205,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.24015748031496062,
"calib/gap": 0.0007247474747473781,
"calib/mean_conf": 0.8720472440944882,
"calib/mu_c": 0.8723611111111111,
"calib/mu_w": 0.8716363636363638,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3051181102362205,
"calib/std_conf": 0.05314278171408132,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.794279210925645,
"calib/step_q_c_n": 659.0,
"calib/step_q_gap": 0.023139970419315925,
"calib/step_q_w": 0.7711392405063291,
"calib/step_q_w_n": 553.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1872.0,
"completions/max_terminated_length": 1872.0,
"completions/mean_length": 505.140625,
"completions/mean_terminated_length": 507.1216125488281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.008533333333333334,
"grad_norm": 0.03701084107160568,
"kl": 0.0010253936052322388,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.0653,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.033262234181165695,
"mask/share_reasoning": 0.8576483726501465,
"mask/share_step_conf": 0.10518313944339752,
"num_tokens": 1865454.0,
"reward": 0.8495633602142334,
"reward_std": 0.16637495160102844,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.6513617038726807,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.7376086115837097,
"step": 8
},
{
"adv/mean_abs_final_conf": 0.7699242234230042,
"adv/mean_abs_reasoning": 0.42989248037338257,
"adv/mean_abs_step_conf": 0.7657554745674133,
"adv/ratio_final_to_reasoning": 1.7909692738851526,
"adv/ratio_step_to_reasoning": 1.781272084364717,
"adv/std_final_conf": 0.9297853112220764,
"adv/std_reasoning": 0.7014699578285217,
"adv/std_step_conf": 0.934877336025238,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.45016307893020213,
"calib/avg_num_step_conf": 5.19140625,
"calib/ece": 0.30529880478087645,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.3147410358565737,
"calib/gap": -0.006487279843444216,
"calib/mean_conf": 0.8790836653386455,
"calib/mu_c": 0.8763698630136987,
"calib/mu_w": 0.8828571428571429,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.30135458167330675,
"calib/std_conf": 0.04754068264220872,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.7566895604395605,
"calib/step_q_c_n": 728.0,
"calib/step_q_gap": 0.022546465597630294,
"calib/step_q_w": 0.7341430948419302,
"calib/step_q_w_n": 601.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2879.0,
"completions/max_terminated_length": 2879.0,
"completions/mean_length": 505.69921875,
"completions/mean_terminated_length": 507.682373046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 182.0,
"epoch": 0.0096,
"grad_norm": 0.0427432544529438,
"kl": 0.00036010146141052246,
"learning_rate": 2.25e-06,
"loss": 0.1186,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.034473203122615814,
"mask/share_reasoning": 0.8522090315818787,
"mask/share_step_conf": 0.10941154509782791,
"num_tokens": 2102449.0,
"reward": 0.8128198385238647,
"reward_std": 0.20088228583335876,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.6406656503677368,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.6787240505218506,
"step": 9
},
{
"adv/mean_abs_final_conf": 0.7671608924865723,
"adv/mean_abs_reasoning": 0.4218064248561859,
"adv/mean_abs_step_conf": 0.7739673256874084,
"adv/ratio_final_to_reasoning": 1.8187510840977217,
"adv/ratio_step_to_reasoning": 1.8348874746307886,
"adv/std_final_conf": 0.9295661449432373,
"adv/std_reasoning": 0.7012843489646912,
"adv/std_step_conf": 0.9343947172164917,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.49496732026143797,
"calib/avg_num_step_conf": 5.0546875,
"calib/ece": 0.2822529644268775,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.33201581027667987,
"calib/gap": -0.0011640522875817627,
"calib/mean_conf": 0.8869960474308299,
"calib/mu_c": 0.8865359477124184,
"calib/mu_w": 0.8877000000000002,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2822529644268775,
"calib/std_conf": 0.040798211425544656,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7859644322845417,
"calib/step_q_c_n": 731.0,
"calib/step_q_gap": -0.007844430948140357,
"calib/step_q_w": 0.7938088632326821,
"calib/step_q_w_n": 563.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3024.0,
"completions/max_terminated_length": 3024.0,
"completions/mean_length": 529.0859375,
"completions/mean_terminated_length": 529.0859375,
"completions/min_length": 161.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.010666666666666666,
"grad_norm": 0.04322522133588791,
"kl": 0.00040537118911743164,
"learning_rate": 2.5e-06,
"loss": 0.0367,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03210898116230965,
"mask/share_reasoning": 0.8572521209716797,
"mask/share_step_conf": 0.11063890159130096,
"num_tokens": 2344695.0,
"reward": 0.846677303314209,
"reward_std": 0.17992925643920898,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6672956943511963,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7096526622772217,
"step": 10
},
{
"adv/mean_abs_final_conf": 0.7750564217567444,
"adv/mean_abs_reasoning": 0.4120665192604065,
"adv/mean_abs_step_conf": 0.7840834259986877,
"adv/ratio_final_to_reasoning": 1.8809012271801329,
"adv/ratio_step_to_reasoning": 1.902807894720474,
"adv/std_final_conf": 0.9275817275047302,
"adv/std_reasoning": 0.6816621422767639,
"adv/std_step_conf": 0.9336605668067932,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5005824111822947,
"calib/avg_num_step_conf": 5.3125,
"calib/ece": 0.2981102362204725,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.421259842519685,
"calib/gap": -0.007611467029055796,
"calib/mean_conf": 0.8851181102362206,
"calib/mu_c": 0.8820915032679739,
"calib/mu_w": 0.8897029702970297,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.2904330708661418,
"calib/std_conf": 0.07794472240478827,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7586185819070905,
"calib/step_q_c_n": 818.0,
"calib/step_q_gap": -0.025366657945308124,
"calib/step_q_w": 0.7839852398523987,
"calib/step_q_w_n": 542.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2593.0,
"completions/max_terminated_length": 2593.0,
"completions/mean_length": 510.859375,
"completions/mean_terminated_length": 510.859375,
"completions/min_length": 161.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.011733333333333333,
"grad_norm": 0.048428092151880264,
"kl": 0.000606834888458252,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.0033,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03325919806957245,
"mask/share_reasoning": 0.8499425649642944,
"mask/share_step_conf": 0.11679823696613312,
"num_tokens": 2579955.0,
"reward": 0.8530465364456177,
"reward_std": 0.16520269215106964,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6653339862823486,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.7235714793205261,
"step": 11
},
{
"adv/mean_abs_final_conf": 0.779029905796051,
"adv/mean_abs_reasoning": 0.4382178783416748,
"adv/mean_abs_step_conf": 0.7512285113334656,
"adv/ratio_final_to_reasoning": 1.777722781973418,
"adv/ratio_step_to_reasoning": 1.7142808371404212,
"adv/std_final_conf": 0.9251027703285217,
"adv/std_reasoning": 0.7014181017875671,
"adv/std_step_conf": 0.9336206912994385,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5273951400711965,
"calib/avg_num_step_conf": 5.3046875,
"calib/ece": 0.18339920948616595,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.4150197628458498,
"calib/gap": 0.00214517876489706,
"calib/mean_conf": 0.8947826086956522,
"calib/mu_c": 0.8953846153846152,
"calib/mu_w": 0.8932394366197182,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.17940711462450587,
"calib/std_conf": 0.045688109327331215,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.776427818756586,
"calib/step_q_c_n": 949.0,
"calib/step_q_gap": 0.0118067918617204,
"calib/step_q_w": 0.7646210268948656,
"calib/step_q_w_n": 409.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2545.0,
"completions/max_terminated_length": 2545.0,
"completions/mean_length": 483.40625,
"completions/mean_terminated_length": 483.40625,
"completions/min_length": 150.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.0128,
"grad_norm": 0.05811633542180061,
"kl": 0.0011870861053466797,
"learning_rate": 3e-06,
"loss": -0.0039,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03670063614845276,
"mask/share_reasoning": 0.8373144865036011,
"mask/share_step_conf": 0.12598487734794617,
"num_tokens": 2807883.0,
"reward": 0.9387664794921875,
"reward_std": 0.17711131274700165,
"rewards/accuracy_reward_step": 0.7109375,
"rewards/final_brier_reward_step": 0.7571523189544678,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.7805367708206177,
"step": 12
},
{
"adv/mean_abs_final_conf": 0.7828412055969238,
"adv/mean_abs_reasoning": 0.46385371685028076,
"adv/mean_abs_step_conf": 0.7666856646537781,
"adv/ratio_final_to_reasoning": 1.687689840910779,
"adv/ratio_step_to_reasoning": 1.6528608843749832,
"adv/std_final_conf": 0.9281428456306458,
"adv/std_reasoning": 0.7205371856689453,
"adv/std_step_conf": 0.9340802431106567,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5346534653465347,
"calib/avg_num_step_conf": 4.71875,
"calib/ece": 0.291171875,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5,
"calib/gap": 0.013747684445863939,
"calib/mean_conf": 0.8966406249999999,
"calib/mu_c": 0.9020645161290324,
"calib/mu_w": 0.8883168316831684,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.291171875,
"calib/std_conf": 0.05633711786743599,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7597331460674157,
"calib/step_q_c_n": 712.0,
"calib/step_q_gap": 0.009954920260964117,
"calib/step_q_w": 0.7497782258064516,
"calib/step_q_w_n": 496.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1816.0,
"completions/max_terminated_length": 1816.0,
"completions/mean_length": 467.59375,
"completions/mean_terminated_length": 469.427490234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 100.0,
"epoch": 0.013866666666666666,
"grad_norm": 0.045925572514534,
"kl": 0.0027227401733398438,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.0038,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.035495657473802567,
"mask/share_reasoning": 0.8489422798156738,
"mask/share_step_conf": 0.11165584623813629,
"num_tokens": 3032179.0,
"reward": 0.8815797567367554,
"reward_std": 0.1894691288471222,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.6747804880142212,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.7688478231430054,
"step": 13
},
{
"adv/mean_abs_final_conf": 0.7548052072525024,
"adv/mean_abs_reasoning": 0.5050555467605591,
"adv/mean_abs_step_conf": 0.7736717462539673,
"adv/ratio_final_to_reasoning": 1.4944993913913924,
"adv/ratio_step_to_reasoning": 1.5318547657110595,
"adv/std_final_conf": 0.9282140731811523,
"adv/std_reasoning": 0.7575881481170654,
"adv/std_step_conf": 0.9345173835754395,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.44230898011116326,
"calib/avg_num_step_conf": 5.859375,
"calib/ece": 0.3535365853658536,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.5853658536585366,
"calib/gap": -0.007598607111765965,
"calib/mean_conf": 0.9104471544715447,
"calib/mu_c": 0.9070802919708029,
"calib/mu_w": 0.9146788990825688,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3535365853658536,
"calib/std_conf": 0.03879541990653156,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.753515923566879,
"calib/step_q_c_n": 785.0,
"calib/step_q_gap": 0.020145294196249663,
"calib/step_q_w": 0.7333706293706294,
"calib/step_q_w_n": 715.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3013.0,
"completions/max_terminated_length": 3013.0,
"completions/mean_length": 567.05078125,
"completions/mean_terminated_length": 569.2745361328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.014933333333333333,
"grad_norm": 0.039881713688373566,
"kl": 0.003141164779663086,
"learning_rate": 3.5e-06,
"loss": 0.0357,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.031403157860040665,
"mask/share_reasoning": 0.8467769622802734,
"mask/share_step_conf": 0.1179136335849762,
"num_tokens": 3282744.0,
"reward": 0.8136132955551147,
"reward_std": 0.19758296012878418,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.5947633385658264,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.7340257167816162,
"step": 14
},
{
"adv/mean_abs_final_conf": 0.7607070207595825,
"adv/mean_abs_reasoning": 0.31956201791763306,
"adv/mean_abs_step_conf": 0.7712626457214355,
"adv/ratio_final_to_reasoning": 2.3804675715737105,
"adv/ratio_step_to_reasoning": 2.4134991096477183,
"adv/std_final_conf": 0.9202884435653687,
"adv/std_reasoning": 0.5960820317268372,
"adv/std_step_conf": 0.9338883757591248,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.49731732243229426,
"calib/avg_num_step_conf": 5.02734375,
"calib/ece": 0.32792156862745103,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.7098039215686275,
"calib/gap": -0.0028570516096064758,
"calib/mean_conf": 0.9185882352941177,
"calib/mu_c": 0.9174342105263159,
"calib/mu_w": 0.9202912621359224,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.32521568627450986,
"calib/std_conf": 0.051032160744173334,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.719733688415446,
"calib/step_q_c_n": 751.0,
"calib/step_q_gap": -0.007131983226345007,
"calib/step_q_w": 0.7268656716417911,
"calib/step_q_w_n": 536.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2654.0,
"completions/max_terminated_length": 2654.0,
"completions/mean_length": 461.67578125,
"completions/mean_terminated_length": 461.67578125,
"completions/min_length": 147.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.016,
"grad_norm": 0.038044411689043045,
"kl": 0.006200313568115234,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0106,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.035136036574840546,
"mask/share_reasoning": 0.846064567565918,
"mask/share_step_conf": 0.1187993660569191,
"num_tokens": 3508813.0,
"reward": 0.8648393154144287,
"reward_std": 0.14193323254585266,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6486945152282715,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.7630153894424438,
"step": 15
},
{
"adv/mean_abs_final_conf": 0.7559776306152344,
"adv/mean_abs_reasoning": 0.42459970712661743,
"adv/mean_abs_step_conf": 0.7681550979614258,
"adv/ratio_final_to_reasoning": 1.7804478381088442,
"adv/ratio_step_to_reasoning": 1.8091277150418728,
"adv/std_final_conf": 0.9152006506919861,
"adv/std_reasoning": 0.6817044019699097,
"adv/std_step_conf": 0.9341446161270142,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5220464725643897,
"calib/avg_num_step_conf": 6.6015625,
"calib/ece": 0.31845528455284555,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.8577235772357723,
"calib/gap": 0.013194288913773833,
"calib/mean_conf": 0.9321951219512195,
"calib/mu_c": 0.937236842105263,
"calib/mu_w": 0.9240425531914892,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.31638211382113823,
"calib/std_conf": 0.0715625439722616,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6638545816733068,
"calib/step_q_c_n": 1004.0,
"calib/step_q_gap": 0.06821318225639716,
"calib/step_q_w": 0.5956413994169096,
"calib/step_q_w_n": 686.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3071.0,
"completions/max_terminated_length": 3071.0,
"completions/mean_length": 673.078125,
"completions/mean_terminated_length": 675.7177124023438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 198.0,
"epoch": 0.017066666666666667,
"grad_norm": 0.036168936640024185,
"kl": 0.006927013397216797,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0749,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.025286775082349777,
"mask/share_reasoning": 0.8620797395706177,
"mask/share_step_conf": 0.10872718691825867,
"num_tokens": 3789969.0,
"reward": 0.8630998134613037,
"reward_std": 0.18709713220596313,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6401921510696411,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.7750698328018188,
"step": 16
},
{
"adv/mean_abs_final_conf": 0.7265196442604065,
"adv/mean_abs_reasoning": 0.5270153284072876,
"adv/mean_abs_step_conf": 0.7634360790252686,
"adv/ratio_final_to_reasoning": 1.3785550535238666,
"adv/ratio_step_to_reasoning": 1.4486031769369532,
"adv/std_final_conf": 0.922140896320343,
"adv/std_reasoning": 0.7927994132041931,
"adv/std_step_conf": 0.9348483681678772,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5486486486486486,
"calib/avg_num_step_conf": 5.625,
"calib/ece": 0.19995983935742973,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.8714859437751004,
"calib/gap": 0.006579391891891939,
"calib/mean_conf": 0.9402008032128514,
"calib/mu_c": 0.9418918918918919,
"calib/mu_w": 0.9353125,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.19859437751004017,
"calib/std_conf": 0.04102550679991887,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.6443323727185398,
"calib/step_q_c_n": 1041.0,
"calib/step_q_gap": 0.01593638274360243,
"calib/step_q_w": 0.6283959899749374,
"calib/step_q_w_n": 399.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2665.0,
"completions/max_terminated_length": 2665.0,
"completions/mean_length": 530.74609375,
"completions/mean_terminated_length": 537.03955078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.018133333333333335,
"grad_norm": 0.03977389633655548,
"kl": 0.01251983642578125,
"learning_rate": 4.25e-06,
"loss": 0.013,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03260646015405655,
"mask/share_reasoning": 0.8350520133972168,
"mask/share_step_conf": 0.12062276899814606,
"num_tokens": 4029368.0,
"reward": 0.945256233215332,
"reward_std": 0.23300540447235107,
"rewards/accuracy_reward_step": 0.72265625,
"rewards/final_brier_reward_step": 0.7420820593833923,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8109303712844849,
"step": 17
},
{
"adv/mean_abs_final_conf": 0.7605211734771729,
"adv/mean_abs_reasoning": 0.3960456848144531,
"adv/mean_abs_step_conf": 0.7509171962738037,
"adv/ratio_final_to_reasoning": 1.9202864786507547,
"adv/ratio_step_to_reasoning": 1.8960368085454773,
"adv/std_final_conf": 0.9212812781333923,
"adv/std_reasoning": 0.6815840005874634,
"adv/std_step_conf": 0.9348611831665039,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.4494174571846947,
"calib/avg_num_step_conf": 5.0546875,
"calib/ece": 0.3976984126984128,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9007936507936508,
"calib/gap": -0.0002954096899471237,
"calib/mean_conf": 0.946031746031746,
"calib/mu_c": 0.9458992805755397,
"calib/mu_w": 0.9461946902654869,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.39607142857142863,
"calib/std_conf": 0.05627426270492683,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6379673590504451,
"calib/step_q_c_n": 674.0,
"calib/step_q_gap": 0.02185445582463863,
"calib/step_q_w": 0.6161129032258065,
"calib/step_q_w_n": 620.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2841.0,
"completions/max_terminated_length": 2841.0,
"completions/mean_length": 526.40234375,
"completions/mean_terminated_length": 526.40234375,
"completions/min_length": 198.0,
"completions/min_terminated_length": 198.0,
"epoch": 0.0192,
"grad_norm": 0.03876996040344238,
"kl": 0.013779640197753906,
"learning_rate": 4.5e-06,
"loss": 0.0361,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.032325707376003265,
"mask/share_reasoning": 0.8605347871780396,
"mask/share_step_conf": 0.10713949799537659,
"num_tokens": 4274847.0,
"reward": 0.8270844221115112,
"reward_std": 0.1768120676279068,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.5842535495758057,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7652277946472168,
"step": 18
},
{
"adv/mean_abs_final_conf": 0.7454897165298462,
"adv/mean_abs_reasoning": 0.4756406545639038,
"adv/mean_abs_step_conf": 0.7751795053482056,
"adv/ratio_final_to_reasoning": 1.5673380931101366,
"adv/ratio_step_to_reasoning": 1.6297587220733625,
"adv/std_final_conf": 0.9180338382720947,
"adv/std_reasoning": 0.7392441630363464,
"adv/std_step_conf": 0.9347119331359863,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5674637862137862,
"calib/avg_num_step_conf": 4.85546875,
"calib/ece": 0.3841960784313725,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.9372549019607843,
"calib/gap": 0.030375249750249633,
"calib/mean_conf": 0.9449803921568627,
"calib/mu_c": 0.9583216783216782,
"calib/mu_w": 0.9279464285714286,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3841960784313725,
"calib/std_conf": 0.10404325838579108,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6270876671619613,
"calib/step_q_c_n": 673.0,
"calib/step_q_gap": 0.046859596986522645,
"calib/step_q_w": 0.5802280701754386,
"calib/step_q_w_n": 570.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1751.0,
"completions/max_terminated_length": 1751.0,
"completions/mean_length": 479.44921875,
"completions/mean_terminated_length": 481.3294372558594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 213.0,
"epoch": 0.020266666666666665,
"grad_norm": 0.02486286498606205,
"kl": 0.019275665283203125,
"learning_rate": 4.75e-06,
"loss": 0.0117,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03219831734895706,
"mask/share_reasoning": 0.8544199466705322,
"mask/share_step_conf": 0.10947546362876892,
"num_tokens": 4502346.0,
"reward": 0.8657012581825256,
"reward_std": 0.19842995703220367,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.6078425645828247,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8126223087310791,
"step": 19
},
{
"adv/mean_abs_final_conf": 0.75970858335495,
"adv/mean_abs_reasoning": 0.42294591665267944,
"adv/mean_abs_step_conf": 0.7696812152862549,
"adv/ratio_final_to_reasoning": 1.7962310391066334,
"adv/ratio_step_to_reasoning": 1.8198100158472799,
"adv/std_final_conf": 0.9097000956535339,
"adv/std_reasoning": 0.681745171546936,
"adv/std_step_conf": 0.9345875382423401,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.3604950495049505,
"calib/avg_num_step_conf": 5.48828125,
"calib/ece": 0.36517928286852586,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9641434262948207,
"calib/gap": -0.00992013201320141,
"calib/mean_conf": 0.9587250996015938,
"calib/mu_c": 0.9547333333333332,
"calib/mu_w": 0.9646534653465346,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.36314741035856574,
"calib/std_conf": 0.03790279199381405,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5661548556430447,
"calib/step_q_c_n": 762.0,
"calib/step_q_gap": 0.004475228893433547,
"calib/step_q_w": 0.5616796267496111,
"calib/step_q_w_n": 643.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2742.0,
"completions/max_terminated_length": 2742.0,
"completions/mean_length": 490.6875,
"completions/mean_terminated_length": 492.6117858886719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.021333333333333333,
"grad_norm": 0.030515167862176895,
"kl": 0.024442672729492188,
"learning_rate": 5e-06,
"loss": 0.04,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.034816332161426544,
"mask/share_reasoning": 0.833861231803894,
"mask/share_step_conf": 0.1274162083864212,
"num_tokens": 4732834.0,
"reward": 0.8644837141036987,
"reward_std": 0.1930766999721527,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6107491850852966,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.804936945438385,
"step": 20
},
{
"adv/mean_abs_final_conf": 0.7336472272872925,
"adv/mean_abs_reasoning": 0.5508327484130859,
"adv/mean_abs_step_conf": 0.7745312452316284,
"adv/ratio_final_to_reasoning": 1.3318874547689536,
"adv/ratio_step_to_reasoning": 1.4061096539067504,
"adv/std_final_conf": 0.9031286835670471,
"adv/std_reasoning": 0.7927908301353455,
"adv/std_step_conf": 0.9346875548362732,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.539980449657869,
"calib/avg_num_step_conf": 5.640625,
"calib/ece": 0.3595669291338581,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.984251968503937,
"calib/gap": 0.002987943955685801,
"calib/mean_conf": 0.9698031496062991,
"calib/mu_c": 0.9709677419354839,
"calib/mu_w": 0.967979797979798,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3595669291338581,
"calib/std_conf": 0.022438907215127788,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5120794392523365,
"calib/step_q_c_n": 856.0,
"calib/step_q_gap": 0.017419575306758195,
"calib/step_q_w": 0.49465986394557826,
"calib/step_q_w_n": 588.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2372.0,
"completions/max_terminated_length": 2372.0,
"completions/mean_length": 503.39453125,
"completions/mean_terminated_length": 505.36865234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.0224,
"grad_norm": 0.025241529569029808,
"kl": 0.02947998046875,
"learning_rate": 4.9722222222222224e-06,
"loss": -0.0444,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03347921371459961,
"mask/share_reasoning": 0.8384062647819519,
"mask/share_step_conf": 0.12420830875635147,
"num_tokens": 4964663.0,
"reward": 0.8812829852104187,
"reward_std": 0.22070921957492828,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.6249253749847412,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8188904523849487,
"step": 21
},
{
"adv/mean_abs_final_conf": 0.7191415429115295,
"adv/mean_abs_reasoning": 0.3771659731864929,
"adv/mean_abs_step_conf": 0.7461881637573242,
"adv/ratio_final_to_reasoning": 1.9066978307609523,
"adv/ratio_step_to_reasoning": 1.978407960434875,
"adv/std_final_conf": 0.8773278594017029,
"adv/std_reasoning": 0.661241352558136,
"adv/std_step_conf": 0.934755265712738,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5144485144485145,
"calib/avg_num_step_conf": 5.9296875,
"calib/ece": 0.3338735177865613,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9802371541501976,
"calib/gap": -0.00047822547822551353,
"calib/mean_conf": 0.9733201581027668,
"calib/mu_c": 0.9731481481481482,
"calib/mu_w": 0.9736263736263737,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.33343873517786565,
"calib/std_conf": 0.018545006380022704,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.49158168574401667,
"calib/step_q_c_n": 961.0,
"calib/step_q_gap": 0.01014542003486052,
"calib/step_q_w": 0.48143626570915615,
"calib/step_q_w_n": 557.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2713.0,
"completions/max_terminated_length": 2713.0,
"completions/mean_length": 489.57421875,
"completions/mean_terminated_length": 491.494140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 199.0,
"epoch": 0.023466666666666667,
"grad_norm": 0.029216783121228218,
"kl": 0.0421905517578125,
"learning_rate": 4.944444444444445e-06,
"loss": -0.0155,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03287845849990845,
"mask/share_reasoning": 0.8300130367279053,
"mask/share_step_conf": 0.1332021951675415,
"num_tokens": 5191810.0,
"reward": 0.894919753074646,
"reward_std": 0.1613466739654541,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.6505191326141357,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8151016235351562,
"step": 22
},
{
"adv/mean_abs_final_conf": 0.747708797454834,
"adv/mean_abs_reasoning": 0.5173189640045166,
"adv/mean_abs_step_conf": 0.7510793209075928,
"adv/ratio_final_to_reasoning": 1.445353542941654,
"adv/ratio_step_to_reasoning": 1.4518689109975007,
"adv/std_final_conf": 0.8959153890609741,
"adv/std_reasoning": 0.7575620412826538,
"adv/std_step_conf": 0.9347652792930603,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.4732358550540369,
"calib/avg_num_step_conf": 5.79296875,
"calib/ece": 0.4126482213438735,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9960474308300395,
"calib/gap": -0.002314685314685483,
"calib/mean_conf": 0.9766007905138341,
"calib/mu_c": 0.9755944055944054,
"calib/mu_w": 0.9779090909090908,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.41201581027667983,
"calib/std_conf": 0.01956813169495786,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.48082932692307695,
"calib/step_q_c_n": 832.0,
"calib/step_q_gap": -0.006620749881838528,
"calib/step_q_w": 0.4874500768049155,
"calib/step_q_w_n": 651.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1693.0,
"completions/max_terminated_length": 1693.0,
"completions/mean_length": 517.4921875,
"completions/mean_terminated_length": 517.4921875,
"completions/min_length": 165.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.024533333333333334,
"grad_norm": 0.03661978244781494,
"kl": 0.046306610107421875,
"learning_rate": 4.9166666666666665e-06,
"loss": 0.0112,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.034062646329402924,
"mask/share_reasoning": 0.8369054794311523,
"mask/share_step_conf": 0.12903186678886414,
"num_tokens": 5428224.0,
"reward": 0.8458771109580994,
"reward_std": 0.20304051041603088,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.5766586065292358,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8049393892288208,
"step": 23
},
{
"adv/mean_abs_final_conf": 0.7730128169059753,
"adv/mean_abs_reasoning": 0.614250123500824,
"adv/mean_abs_step_conf": 0.7427619695663452,
"adv/ratio_final_to_reasoning": 1.25846587136248,
"adv/ratio_step_to_reasoning": 1.2092174525469979,
"adv/std_final_conf": 0.9102962613105774,
"adv/std_reasoning": 0.8267250657081604,
"adv/std_step_conf": 0.9349828958511353,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5206280991735537,
"calib/avg_num_step_conf": 6.4921875,
"calib/ece": 0.486869918699187,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0016667768595042265,
"calib/mean_conf": 0.978739837398374,
"calib/mu_c": 0.9795867768595042,
"calib/mu_w": 0.97792,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.486869918699187,
"calib/std_conf": 0.01430068359181033,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4792125,
"calib/step_q_c_n": 800.0,
"calib/step_q_gap": 0.048354031322505764,
"calib/step_q_w": 0.4308584686774942,
"calib/step_q_w_n": 862.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2509.0,
"completions/max_terminated_length": 2509.0,
"completions/mean_length": 578.37890625,
"completions/mean_terminated_length": 585.2371826171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.0256,
"grad_norm": 0.025873012840747833,
"kl": 0.036952972412109375,
"learning_rate": 4.888888888888889e-06,
"loss": -0.0523,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03086896426975727,
"mask/share_reasoning": 0.8307771682739258,
"mask/share_step_conf": 0.1266351342201233,
"num_tokens": 5680801.0,
"reward": 0.7857183218002319,
"reward_std": 0.24644377827644348,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.4935879111289978,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.7911299467086792,
"step": 24
},
{
"adv/mean_abs_final_conf": 0.6677876710891724,
"adv/mean_abs_reasoning": 0.44983309507369995,
"adv/mean_abs_step_conf": 0.7818006277084351,
"adv/ratio_final_to_reasoning": 1.4845232118365215,
"adv/ratio_step_to_reasoning": 1.7379793444951979,
"adv/std_final_conf": 0.854354739189148,
"adv/std_reasoning": 0.7204803228378296,
"adv/std_step_conf": 0.9348347187042236,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5566984126984127,
"calib/avg_num_step_conf": 6.09765625,
"calib/ece": 0.3904705882352941,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0023333333333330764,
"calib/mean_conf": 0.9787058823529412,
"calib/mu_c": 0.9796666666666666,
"calib/mu_w": 0.9773333333333335,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3904705882352941,
"calib/std_conf": 0.014907945134827336,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4798648648648648,
"calib/step_q_c_n": 888.0,
"calib/step_q_gap": 0.034456246737078755,
"calib/step_q_w": 0.44540861812778604,
"calib/step_q_w_n": 673.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2071.0,
"completions/max_terminated_length": 2071.0,
"completions/mean_length": 486.875,
"completions/mean_terminated_length": 486.875,
"completions/min_length": 179.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.02666666666666667,
"grad_norm": 0.03276247903704643,
"kl": 0.05126953125,
"learning_rate": 4.861111111111111e-06,
"loss": 0.0223,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03289720416069031,
"mask/share_reasoning": 0.8330222964286804,
"mask/share_step_conf": 0.13408046960830688,
"num_tokens": 5908665.0,
"reward": 0.8634007573127747,
"reward_std": 0.17636817693710327,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.603858232498169,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8065370321273804,
"step": 25
},
{
"adv/mean_abs_final_conf": 0.7102105617523193,
"adv/mean_abs_reasoning": 0.4211689233779907,
"adv/mean_abs_step_conf": 0.7700457572937012,
"adv/ratio_final_to_reasoning": 1.6862843441915574,
"adv/ratio_step_to_reasoning": 1.8283536950388917,
"adv/std_final_conf": 0.867534339427948,
"adv/std_reasoning": 0.6815800666809082,
"adv/std_step_conf": 0.9347423911094666,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5649297201468912,
"calib/avg_num_step_conf": 5.55859375,
"calib/ece": 0.39337254901960783,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.996078431372549,
"calib/gap": 0.00427440800303891,
"calib/mean_conf": 0.9776862745098039,
"calib/mu_c": 0.9794630872483221,
"calib/mu_w": 0.9751886792452832,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.39337254901960783,
"calib/std_conf": 0.015254045038687006,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.507744966442953,
"calib/step_q_c_n": 745.0,
"calib/step_q_gap": 0.035901308625843786,
"calib/step_q_w": 0.4718436578171092,
"calib/step_q_w_n": 678.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2112.0,
"completions/max_terminated_length": 2112.0,
"completions/mean_length": 490.828125,
"completions/mean_terminated_length": 492.7529602050781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 196.0,
"epoch": 0.027733333333333332,
"grad_norm": 0.06875115633010864,
"kl": 0.29926300048828125,
"learning_rate": 4.833333333333333e-06,
"loss": 0.0166,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.030738497152924538,
"mask/share_reasoning": 0.8481480479240417,
"mask/share_step_conf": 0.11720723658800125,
"num_tokens": 6139557.0,
"reward": 0.8700515031814575,
"reward_std": 0.16770586371421814,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.6018503904342651,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8226275444030762,
"step": 26
},
{
"adv/mean_abs_final_conf": 0.7465525269508362,
"adv/mean_abs_reasoning": 0.4980428218841553,
"adv/mean_abs_step_conf": 0.7682787179946899,
"adv/ratio_final_to_reasoning": 1.4989725665085165,
"adv/ratio_step_to_reasoning": 1.5425957051006178,
"adv/std_final_conf": 0.8857267498970032,
"adv/std_reasoning": 0.7393344640731812,
"adv/std_step_conf": 0.9349102973937988,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5512772351615327,
"calib/avg_num_step_conf": 6.40234375,
"calib/ece": 0.45450592885375496,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9960474308300395,
"calib/gap": 0.0043663911845729375,
"calib/mean_conf": 0.9762450592885376,
"calib/mu_c": 0.9783333333333335,
"calib/mu_w": 0.9739669421487606,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.45450592885375496,
"calib/std_conf": 0.016978573607796924,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4405236270753512,
"calib/step_q_c_n": 783.0,
"calib/step_q_gap": 0.0022993280099306568,
"calib/step_q_w": 0.43822429906542054,
"calib/step_q_w_n": 856.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2318.0,
"completions/max_terminated_length": 2318.0,
"completions/mean_length": 491.70703125,
"completions/mean_terminated_length": 495.5787353515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 192.0,
"epoch": 0.0288,
"grad_norm": 0.13274186849594116,
"kl": 0.061382293701171875,
"learning_rate": 4.805555555555556e-06,
"loss": -0.0484,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.0327107235789299,
"mask/share_reasoning": 0.8278404474258423,
"mask/share_step_conf": 0.13163632154464722,
"num_tokens": 6370650.0,
"reward": 0.8293382525444031,
"reward_std": 0.19650721549987793,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.5393917560577393,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8185034990310669,
"step": 27
},
{
"adv/mean_abs_final_conf": 0.6987853050231934,
"adv/mean_abs_reasoning": 0.4233437180519104,
"adv/mean_abs_step_conf": 0.7611550688743591,
"adv/ratio_final_to_reasoning": 1.6506334574628276,
"adv/ratio_step_to_reasoning": 1.7979599942499356,
"adv/std_final_conf": 0.8700273633003235,
"adv/std_reasoning": 0.7013515830039978,
"adv/std_step_conf": 0.934949517250061,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5388085399449037,
"calib/avg_num_step_conf": 5.5546875,
"calib/ece": 0.3256521739130434,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9881422924901185,
"calib/gap": -0.0005681818181820786,
"calib/mean_conf": 0.9747430830039526,
"calib/mu_c": 0.9745454545454543,
"calib/mu_w": 0.9751136363636363,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.32411067193675885,
"calib/std_conf": 0.02992139242886999,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4748793859649123,
"calib/step_q_c_n": 912.0,
"calib/step_q_gap": 0.03242840557275545,
"calib/step_q_w": 0.44245098039215686,
"calib/step_q_w_n": 510.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2281.0,
"completions/max_terminated_length": 2281.0,
"completions/mean_length": 541.046875,
"completions/mean_terminated_length": 541.046875,
"completions/min_length": 181.0,
"completions/min_terminated_length": 181.0,
"epoch": 0.029866666666666666,
"grad_norm": 0.026259498670697212,
"kl": 0.040096282958984375,
"learning_rate": 4.777777777777778e-06,
"loss": -0.0154,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03135649859905243,
"mask/share_reasoning": 0.8519505858421326,
"mask/share_step_conf": 0.11669294536113739,
"num_tokens": 6616102.0,
"reward": 0.9012293219566345,
"reward_std": 0.18706245720386505,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.656219482421875,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8204576969146729,
"step": 28
},
{
"adv/mean_abs_final_conf": 0.7530903816223145,
"adv/mean_abs_reasoning": 0.5109613537788391,
"adv/mean_abs_step_conf": 0.7395081520080566,
"adv/ratio_final_to_reasoning": 1.473869552076294,
"adv/ratio_step_to_reasoning": 1.4472878360349344,
"adv/std_final_conf": 0.9067643284797668,
"adv/std_reasoning": 0.7575812339782715,
"adv/std_step_conf": 0.9346757531166077,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5175736961451247,
"calib/avg_num_step_conf": 6.69921875,
"calib/ece": 0.47662698412698423,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.996031746031746,
"calib/gap": -0.001825396825396619,
"calib/mean_conf": 0.9726587301587301,
"calib/mu_c": 0.9717460317460319,
"calib/mu_w": 0.9735714285714285,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.47464285714285726,
"calib/std_conf": 0.03489887038204902,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.45238341968911916,
"calib/step_q_c_n": 772.0,
"calib/step_q_gap": 0.05528904004967056,
"calib/step_q_w": 0.3970943796394486,
"calib/step_q_w_n": 943.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2779.0,
"completions/max_terminated_length": 2779.0,
"completions/mean_length": 581.3125,
"completions/mean_terminated_length": 581.3125,
"completions/min_length": 183.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.030933333333333334,
"grad_norm": 0.02828669734299183,
"kl": 0.03983306884765625,
"learning_rate": 4.75e-06,
"loss": -0.0191,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.02909451350569725,
"mask/share_reasoning": 0.8438331484794617,
"mask/share_step_conf": 0.12707233428955078,
"num_tokens": 6872046.0,
"reward": 0.8204588890075684,
"reward_std": 0.19906474649906158,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.5123624801635742,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8340240716934204,
"step": 29
},
{
"adv/mean_abs_final_conf": 0.7167383432388306,
"adv/mean_abs_reasoning": 0.5228696465492249,
"adv/mean_abs_step_conf": 0.738788366317749,
"adv/ratio_final_to_reasoning": 1.3707782579636782,
"adv/ratio_step_to_reasoning": 1.4129494247629784,
"adv/std_final_conf": 0.9092122912406921,
"adv/std_reasoning": 0.7927682995796204,
"adv/std_step_conf": 0.9349443316459656,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5244153600850386,
"calib/avg_num_step_conf": 6.23828125,
"calib/ece": 0.40125,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.9879032258064516,
"calib/gap": 0.0037244220037203624,
"calib/mean_conf": 0.9738306451612904,
"calib/mu_c": 0.9754225352112675,
"calib/mu_w": 0.9716981132075472,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.40125,
"calib/std_conf": 0.02089344000081456,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.42900846432889966,
"calib/step_q_c_n": 827.0,
"calib/step_q_gap": 0.0022292435496789054,
"calib/step_q_w": 0.42677922077922076,
"calib/step_q_w_n": 770.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2605.0,
"completions/max_terminated_length": 2605.0,
"completions/mean_length": 588.99609375,
"completions/mean_terminated_length": 593.6338500976562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.032,
"grad_norm": 0.02038068138062954,
"kl": 0.049793243408203125,
"learning_rate": 4.722222222222222e-06,
"loss": -0.0398,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.028470445424318314,
"mask/share_reasoning": 0.844284176826477,
"mask/share_step_conf": 0.11943288147449493,
"num_tokens": 7129813.0,
"reward": 0.8432120084762573,
"reward_std": 0.2119450569152832,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.5770386457443237,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8046977519989014,
"step": 30
},
{
"adv/mean_abs_final_conf": 0.7576237916946411,
"adv/mean_abs_reasoning": 0.43475398421287537,
"adv/mean_abs_step_conf": 0.7570363283157349,
"adv/ratio_final_to_reasoning": 1.7426494505077934,
"adv/ratio_step_to_reasoning": 1.741298195774683,
"adv/std_final_conf": 0.9100903272628784,
"adv/std_reasoning": 0.7014799118041992,
"adv/std_step_conf": 0.9344565868377686,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5342292089249493,
"calib/avg_num_step_conf": 7.34765625,
"calib/ece": 0.5067063492063492,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9722222222222222,
"calib/gap": 0.010945740365111867,
"calib/mean_conf": 0.9670238095238095,
"calib/mu_c": 0.9729310344827589,
"calib/mu_w": 0.961985294117647,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.5067063492063492,
"calib/std_conf": 0.06575169311426353,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4390456431535269,
"calib/step_q_c_n": 723.0,
"calib/step_q_gap": 0.0510145550706253,
"calib/step_q_w": 0.3880310880829016,
"calib/step_q_w_n": 1158.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2400.0,
"completions/max_terminated_length": 2400.0,
"completions/mean_length": 599.3359375,
"completions/mean_terminated_length": 601.6863403320312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 188.0,
"epoch": 0.03306666666666667,
"grad_norm": 0.023156926035881042,
"kl": 0.037746429443359375,
"learning_rate": 4.694444444444445e-06,
"loss": -0.0181,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.02954915352165699,
"mask/share_reasoning": 0.840360701084137,
"mask/share_step_conf": 0.12618383765220642,
"num_tokens": 7389155.0,
"reward": 0.8060630559921265,
"reward_std": 0.19124513864517212,
"rewards/accuracy_reward_step": 0.453125,
"rewards/final_brier_reward_step": 0.4881894588470459,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8364366292953491,
"step": 31
},
{
"adv/mean_abs_final_conf": 0.763539731502533,
"adv/mean_abs_reasoning": 0.502423882484436,
"adv/mean_abs_step_conf": 0.767042875289917,
"adv/ratio_final_to_reasoning": 1.519712255171679,
"adv/ratio_step_to_reasoning": 1.526684741770169,
"adv/std_final_conf": 0.9136727452278137,
"adv/std_reasoning": 0.7576688528060913,
"adv/std_step_conf": 0.93479984998703,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5844725950081525,
"calib/avg_num_step_conf": 6.0390625,
"calib/ece": 0.4245849802371543,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.924901185770751,
"calib/gap": 0.02855512354195411,
"calib/mean_conf": 0.9440316205533598,
"calib/mu_c": 0.9574626865671643,
"calib/mu_w": 0.9289075630252102,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4194861660079053,
"calib/std_conf": 0.12681391781318305,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4357412398921833,
"calib/step_q_c_n": 742.0,
"calib/step_q_gap": 0.05600243392203397,
"calib/step_q_w": 0.3797388059701493,
"calib/step_q_w_n": 804.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2402.0,
"completions/max_terminated_length": 2402.0,
"completions/mean_length": 504.4375,
"completions/mean_terminated_length": 508.4094543457031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.034133333333333335,
"grad_norm": 0.02298598177731037,
"kl": 0.048114776611328125,
"learning_rate": 4.666666666666667e-06,
"loss": -0.044,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03240440785884857,
"mask/share_reasoning": 0.830779492855072,
"mask/share_step_conf": 0.1290036290884018,
"num_tokens": 7624995.0,
"reward": 0.8541609048843384,
"reward_std": 0.20546889305114746,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.5703113079071045,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8364480137825012,
"step": 32
},
{
"adv/mean_abs_final_conf": 0.730973482131958,
"adv/mean_abs_reasoning": 0.43391260504722595,
"adv/mean_abs_step_conf": 0.7419459223747253,
"adv/ratio_final_to_reasoning": 1.684609927504643,
"adv/ratio_step_to_reasoning": 1.7098971399873801,
"adv/std_final_conf": 0.894087553024292,
"adv/std_reasoning": 0.701427698135376,
"adv/std_step_conf": 0.9345289468765259,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5539067422810334,
"calib/avg_num_step_conf": 6.52734375,
"calib/ece": 0.40873517786561275,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9486166007905138,
"calib/gap": 0.03398550724637672,
"calib/mean_conf": 0.9541897233201582,
"calib/mu_c": 0.9696376811594203,
"calib/mu_w": 0.9356521739130436,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.40873517786561275,
"calib/std_conf": 0.10689211607677195,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.42857929515418497,
"calib/step_q_c_n": 908.0,
"calib/step_q_gap": 0.07017824666139333,
"calib/step_q_w": 0.35840104849279164,
"calib/step_q_w_n": 763.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2725.0,
"completions/max_terminated_length": 2725.0,
"completions/mean_length": 537.07421875,
"completions/mean_terminated_length": 541.3031616210938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 188.0,
"epoch": 0.0352,
"grad_norm": 0.019974276423454285,
"kl": 0.048274993896484375,
"learning_rate": 4.638888888888889e-06,
"loss": -0.0733,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03093218244612217,
"mask/share_reasoning": 0.8321001529693604,
"mask/share_step_conf": 0.12915518879890442,
"num_tokens": 7869358.0,
"reward": 0.8696585893630981,
"reward_std": 0.18201705813407898,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.5835089683532715,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8503393530845642,
"step": 33
},
{
"adv/mean_abs_final_conf": 0.7161438465118408,
"adv/mean_abs_reasoning": 0.5695993304252625,
"adv/mean_abs_step_conf": 0.7828904390335083,
"adv/ratio_final_to_reasoning": 1.2572764893827533,
"adv/ratio_step_to_reasoning": 1.374458144901615,
"adv/std_final_conf": 0.9155739545822144,
"adv/std_reasoning": 0.8264137506484985,
"adv/std_step_conf": 0.934232234954834,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.4919909898635966,
"calib/avg_num_step_conf": 6.81640625,
"calib/ece": 0.43913043478260877,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9407114624505929,
"calib/gap": 0.006639969966211967,
"calib/mean_conf": 0.94901185770751,
"calib/mu_c": 0.9522137404580152,
"calib/mu_w": 0.9455737704918032,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4351778656126483,
"calib/std_conf": 0.09978738196047238,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.39701346389228886,
"calib/step_q_c_n": 817.0,
"calib/step_q_gap": 0.011119067340564726,
"calib/step_q_w": 0.38589439655172414,
"calib/step_q_w_n": 928.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2860.0,
"completions/max_terminated_length": 2860.0,
"completions/mean_length": 474.125,
"completions/mean_terminated_length": 477.8582763671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 191.0,
"epoch": 0.03626666666666667,
"grad_norm": 0.01743357814848423,
"kl": 0.0554962158203125,
"learning_rate": 4.611111111111112e-06,
"loss": -0.0244,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.033780019730329514,
"mask/share_reasoning": 0.8123932480812073,
"mask/share_step_conf": 0.14601418375968933,
"num_tokens": 8095846.0,
"reward": 0.843841552734375,
"reward_std": 0.19271723926067352,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.5511835813522339,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8364993929862976,
"step": 34
},
{
"adv/mean_abs_final_conf": 0.7311309576034546,
"adv/mean_abs_reasoning": 0.5093601942062378,
"adv/mean_abs_step_conf": 0.752097487449646,
"adv/ratio_final_to_reasoning": 1.4353908411371123,
"adv/ratio_step_to_reasoning": 1.4765533231776744,
"adv/std_final_conf": 0.9236873984336853,
"adv/std_reasoning": 0.7753287553787231,
"adv/std_step_conf": 0.934443473815918,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6312058971774194,
"calib/avg_num_step_conf": 6.53125,
"calib/ece": 0.44246031746031755,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.8531746031746031,
"calib/gap": 0.056030745967741935,
"calib/mean_conf": 0.9345238095238095,
"calib/mu_c": 0.962983870967742,
"calib/mu_w": 0.906953125,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.44246031746031755,
"calib/std_conf": 0.1116620975814138,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3825062972292192,
"calib/step_q_c_n": 794.0,
"calib/step_q_gap": 0.027586023880699806,
"calib/step_q_w": 0.3549202733485194,
"calib/step_q_w_n": 878.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2681.0,
"completions/max_terminated_length": 2681.0,
"completions/mean_length": 548.6953125,
"completions/mean_terminated_length": 555.2015991210938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 211.0,
"epoch": 0.037333333333333336,
"grad_norm": 0.03290172666311264,
"kl": 0.05770111083984375,
"learning_rate": 4.583333333333333e-06,
"loss": -0.0702,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.029511885717511177,
"mask/share_reasoning": 0.8369863033294678,
"mask/share_step_conf": 0.12178307771682739,
"num_tokens": 8345568.0,
"reward": 0.8452932834625244,
"reward_std": 0.2060595154762268,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.5609281063079834,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8359084129333496,
"step": 35
},
{
"adv/mean_abs_final_conf": 0.7096432447433472,
"adv/mean_abs_reasoning": 0.37983816862106323,
"adv/mean_abs_step_conf": 0.7735100984573364,
"adv/ratio_final_to_reasoning": 1.8682778703351066,
"adv/ratio_step_to_reasoning": 2.0364201450987167,
"adv/std_final_conf": 0.9167184829711914,
"adv/std_reasoning": 0.6815183758735657,
"adv/std_step_conf": 0.9348844885826111,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5985863095238095,
"calib/avg_num_step_conf": 6.6796875,
"calib/ece": 0.20118577075098812,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.8260869565217391,
"calib/gap": 0.04194857804232799,
"calib/mean_conf": 0.9274308300395256,
"calib/mu_c": 0.938042328042328,
"calib/mu_w": 0.89609375,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19079051383399206,
"calib/std_conf": 0.12201260285332109,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.37776412776412777,
"calib/step_q_c_n": 1221.0,
"calib/step_q_gap": 0.007661878275375178,
"calib/step_q_w": 0.3701022494887526,
"calib/step_q_w_n": 489.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2194.0,
"completions/max_terminated_length": 2194.0,
"completions/mean_length": 487.57421875,
"completions/mean_terminated_length": 487.57421875,
"completions/min_length": 167.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.0384,
"grad_norm": 0.04586457833647728,
"kl": 0.0617828369140625,
"learning_rate": 4.555555555555556e-06,
"loss": 0.0154,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03654136136174202,
"mask/share_reasoning": 0.8108397722244263,
"mask/share_step_conf": 0.15261885523796082,
"num_tokens": 8573099.0,
"reward": 0.9648346304893494,
"reward_std": 0.16412945091724396,
"rewards/accuracy_reward_step": 0.73828125,
"rewards/final_brier_reward_step": 0.7703171372413635,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8140394687652588,
"step": 36
},
{
"adv/mean_abs_final_conf": 0.7422738075256348,
"adv/mean_abs_reasoning": 0.40673696994781494,
"adv/mean_abs_step_conf": 0.7679119110107422,
"adv/ratio_final_to_reasoning": 1.8249479697428779,
"adv/ratio_step_to_reasoning": 1.8879815894514498,
"adv/std_final_conf": 0.9150478839874268,
"adv/std_reasoning": 0.6613951921463013,
"adv/std_step_conf": 0.9349683523178101,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.6805715057758562,
"calib/avg_num_step_conf": 6.44140625,
"calib/ece": 0.4525000000000001,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.7622950819672131,
"calib/gap": 0.05334797000608005,
"calib/mean_conf": 0.9090573770491803,
"calib/mu_c": 0.937699115044248,
"calib/mu_w": 0.8843511450381679,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.44922131147540995,
"calib/std_conf": 0.13716215120895295,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.37116319444444446,
"calib/step_q_c_n": 576.0,
"calib/step_q_gap": 0.06508677319560946,
"calib/step_q_w": 0.306076421248835,
"calib/step_q_w_n": 1073.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2687.0,
"completions/max_terminated_length": 2687.0,
"completions/mean_length": 547.22265625,
"completions/mean_terminated_length": 553.7114868164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.039466666666666664,
"grad_norm": 0.030891193076968193,
"kl": 0.056957244873046875,
"learning_rate": 4.527777777777778e-06,
"loss": 0.0362,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.03294394165277481,
"mask/share_reasoning": 0.8250966668128967,
"mask/share_step_conf": 0.13024061918258667,
"num_tokens": 8820284.0,
"reward": 0.8101900219917297,
"reward_std": 0.18450546264648438,
"rewards/accuracy_reward_step": 0.44140625,
"rewards/final_brier_reward_step": 0.5339511632919312,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8075225353240967,
"step": 37
},
{
"adv/mean_abs_final_conf": 0.726380467414856,
"adv/mean_abs_reasoning": 0.43823903799057007,
"adv/mean_abs_step_conf": 0.7534304261207581,
"adv/ratio_final_to_reasoning": 1.6574983158631478,
"adv/ratio_step_to_reasoning": 1.7192225265357812,
"adv/std_final_conf": 0.9267221093177795,
"adv/std_reasoning": 0.7014729976654053,
"adv/std_step_conf": 0.9348070025444031,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6114302310571834,
"calib/avg_num_step_conf": 6.2578125,
"calib/ece": 0.4103212851405621,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.7349397590361446,
"calib/gap": 0.047305408545243455,
"calib/mean_conf": 0.892570281124498,
"calib/mu_c": 0.9157480314960631,
"calib/mu_w": 0.8684426229508196,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3964257028112448,
"calib/std_conf": 0.17416876619184174,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.35076177285318555,
"calib/step_q_c_n": 722.0,
"calib/step_q_gap": 0.03461404558045822,
"calib/step_q_w": 0.31614772727272733,
"calib/step_q_w_n": 880.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2035.0,
"completions/max_terminated_length": 2035.0,
"completions/mean_length": 538.00390625,
"completions/mean_terminated_length": 542.2401733398438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.04053333333333333,
"grad_norm": 0.02675260789692402,
"kl": 0.05478668212890625,
"learning_rate": 4.5e-06,
"loss": -0.0246,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.031882841140031815,
"mask/share_reasoning": 0.8331516981124878,
"mask/share_step_conf": 0.12715290486812592,
"num_tokens": 9064901.0,
"reward": 0.8506743907928467,
"reward_std": 0.1840541660785675,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.58075350522995,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8260639309883118,
"step": 38
},
{
"adv/mean_abs_final_conf": 0.7293663024902344,
"adv/mean_abs_reasoning": 0.45697662234306335,
"adv/mean_abs_step_conf": 0.7594219446182251,
"adv/ratio_final_to_reasoning": 1.5960691790983599,
"adv/ratio_step_to_reasoning": 1.661839812996186,
"adv/std_final_conf": 0.9291202425956726,
"adv/std_reasoning": 0.7205585241317749,
"adv/std_step_conf": 0.9344246983528137,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5710066769388804,
"calib/avg_num_step_conf": 6.6640625,
"calib/ece": 0.36484000000000005,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.664,
"calib/gap": 0.047203389830508535,
"calib/mean_conf": 0.85772,
"calib/mu_c": 0.88,
"calib/mu_w": 0.8327966101694915,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.34728000000000003,
"calib/std_conf": 0.21252012045921675,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.37581534772182257,
"calib/step_q_c_n": 834.0,
"calib/step_q_gap": 0.06166397157503367,
"calib/step_q_w": 0.3141513761467889,
"calib/step_q_w_n": 872.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2459.0,
"completions/max_terminated_length": 2459.0,
"completions/mean_length": 518.83984375,
"completions/mean_terminated_length": 522.9251708984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.0416,
"grad_norm": 0.025498710572719574,
"kl": 0.05701446533203125,
"learning_rate": 4.472222222222223e-06,
"loss": 0.0204,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03186484053730965,
"mask/share_reasoning": 0.8250347375869751,
"mask/share_step_conf": 0.13528786599636078,
"num_tokens": 9303812.0,
"reward": 0.8701096773147583,
"reward_std": 0.1742296665906906,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.6058902740478516,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8358914852142334,
"step": 39
},
{
"adv/mean_abs_final_conf": 0.774293065071106,
"adv/mean_abs_reasoning": 0.46777427196502686,
"adv/mean_abs_step_conf": 0.7591378688812256,
"adv/ratio_final_to_reasoning": 1.6552707394924788,
"adv/ratio_step_to_reasoning": 1.6228722150370478,
"adv/std_final_conf": 0.935425341129303,
"adv/std_reasoning": 0.7205560803413391,
"adv/std_step_conf": 0.9340584874153137,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.51763916015625,
"calib/avg_num_step_conf": 6.7578125,
"calib/ece": 0.4000390625,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.58984375,
"calib/gap": 0.010703125000000036,
"calib/mean_conf": 0.8308984374999999,
"calib/mu_c": 0.83625,
"calib/mu_w": 0.825546875,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.36546875,
"calib/std_conf": 0.21778082947095823,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3220449172576832,
"calib/step_q_c_n": 846.0,
"calib/step_q_gap": -0.025658702651819076,
"calib/step_q_w": 0.34770361990950227,
"calib/step_q_w_n": 884.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1865.0,
"completions/max_terminated_length": 1865.0,
"completions/mean_length": 523.01171875,
"completions/mean_terminated_length": 525.0628051757812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.042666666666666665,
"grad_norm": 0.025849351659417152,
"kl": 0.06219482421875,
"learning_rate": 4.444444444444444e-06,
"loss": -0.0527,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.033173397183418274,
"mask/share_reasoning": 0.8261204957962036,
"mask/share_step_conf": 0.13679982721805573,
"num_tokens": 9544463.0,
"reward": 0.8790460824966431,
"reward_std": 0.15888527035713196,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.5984293222427368,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.8596628904342651,
"step": 40
},
{
"adv/mean_abs_final_conf": 0.7439480423927307,
"adv/mean_abs_reasoning": 0.42733556032180786,
"adv/mean_abs_step_conf": 0.7698180675506592,
"adv/ratio_final_to_reasoning": 1.7408989830673014,
"adv/ratio_step_to_reasoning": 1.8014369479828511,
"adv/std_final_conf": 0.9251158237457275,
"adv/std_reasoning": 0.701311469078064,
"adv/std_step_conf": 0.9340131282806396,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6641814975148307,
"calib/avg_num_step_conf": 5.9296875,
"calib/ece": 0.16066666666666657,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.5137254901960784,
"calib/gap": 0.11002164502164502,
"calib/mean_conf": 0.8019999999999999,
"calib/mu_c": 0.8304761904761905,
"calib/mu_w": 0.7204545454545455,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11074509803921559,
"calib/std_conf": 0.24876542222366363,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3597395348837209,
"calib/step_q_c_n": 1075.0,
"calib/step_q_gap": 0.01599235655414971,
"calib/step_q_w": 0.3437471783295712,
"calib/step_q_w_n": 443.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1069.0,
"completions/max_terminated_length": 1069.0,
"completions/mean_length": 463.296875,
"completions/mean_terminated_length": 465.1137390136719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.04373333333333333,
"grad_norm": 0.05439276620745659,
"kl": 0.080596923828125,
"learning_rate": 4.416666666666667e-06,
"loss": -0.0363,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03658699989318848,
"mask/share_reasoning": 0.8209108114242554,
"mask/share_step_conf": 0.13859596848487854,
"num_tokens": 9770315.0,
"reward": 0.9843002557754517,
"reward_std": 0.14363425970077515,
"rewards/accuracy_reward_step": 0.73828125,
"rewards/final_brier_reward_step": 0.7817285060882568,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8399968147277832,
"step": 41
},
{
"adv/mean_abs_final_conf": 0.7876873016357422,
"adv/mean_abs_reasoning": 0.45011699199676514,
"adv/mean_abs_step_conf": 0.7809913158416748,
"adv/ratio_final_to_reasoning": 1.749961267050774,
"adv/ratio_step_to_reasoning": 1.7350851661411786,
"adv/std_final_conf": 0.923464834690094,
"adv/std_reasoning": 0.7013947367668152,
"adv/std_step_conf": 0.9337511658668518,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6822884207902281,
"calib/avg_num_step_conf": 6.28515625,
"calib/ece": 0.2839607843137254,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.5568627450980392,
"calib/gap": 0.11492583821513147,
"calib/mean_conf": 0.8147058823529412,
"calib/mu_c": 0.8656338028169013,
"calib/mu_w": 0.7507079646017698,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.27090196078431367,
"calib/std_conf": 0.2344367713677176,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3854927884615385,
"calib/step_q_c_n": 832.0,
"calib/step_q_gap": 0.020280433249183327,
"calib/step_q_w": 0.3652123552123552,
"calib/step_q_w_n": 777.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2811.0,
"completions/max_terminated_length": 2811.0,
"completions/mean_length": 445.33203125,
"completions/mean_terminated_length": 445.33203125,
"completions/min_length": 157.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.0448,
"grad_norm": 0.03572205826640129,
"kl": 0.067230224609375,
"learning_rate": 4.388888888888889e-06,
"loss": 0.058,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03780219703912735,
"mask/share_reasoning": 0.8130389451980591,
"mask/share_step_conf": 0.14915883541107178,
"num_tokens": 9988688.0,
"reward": 0.926224946975708,
"reward_std": 0.15658482909202576,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.6858199238777161,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8564735651016235,
"step": 42
},
{
"adv/mean_abs_final_conf": 0.7961469888687134,
"adv/mean_abs_reasoning": 0.6136384606361389,
"adv/mean_abs_step_conf": 0.7659261226654053,
"adv/ratio_final_to_reasoning": 1.2974202888837407,
"adv/ratio_step_to_reasoning": 1.248171638184795,
"adv/std_final_conf": 0.9268906712532043,
"adv/std_reasoning": 0.8099459409713745,
"adv/std_step_conf": 0.9335736632347107,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6772471364783537,
"calib/avg_num_step_conf": 5.8515625,
"calib/ece": 0.2553149606299212,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.562992125984252,
"calib/gap": 0.15169934640522875,
"calib/mean_conf": 0.8013779527559054,
"calib/mu_c": 0.8616993464052288,
"calib/mu_w": 0.7100000000000001,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.22716535433070859,
"calib/std_conf": 0.26301116497165244,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.3862155688622755,
"calib/step_q_c_n": 835.0,
"calib/step_q_gap": 0.012082838847192579,
"calib/step_q_w": 0.3741327300150829,
"calib/step_q_w_n": 663.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2298.0,
"completions/max_terminated_length": 2298.0,
"completions/mean_length": 501.875,
"completions/mean_terminated_length": 501.875,
"completions/min_length": 151.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.04586666666666667,
"grad_norm": 0.03204227611422539,
"kl": 0.05794525146484375,
"learning_rate": 4.361111111111112e-06,
"loss": -0.0227,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03475869074463844,
"mask/share_reasoning": 0.8334618210792542,
"mask/share_step_conf": 0.1317795068025589,
"num_tokens": 10222392.0,
"reward": 0.9406875967979431,
"reward_std": 0.20013371109962463,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.7148027420043945,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8493848443031311,
"step": 43
},
{
"adv/mean_abs_final_conf": 0.6989273428916931,
"adv/mean_abs_reasoning": 0.3648616373538971,
"adv/mean_abs_step_conf": 0.7579975128173828,
"adv/ratio_final_to_reasoning": 1.9155955884004583,
"adv/ratio_step_to_reasoning": 2.0774930417860404,
"adv/std_final_conf": 0.8844950199127197,
"adv/std_reasoning": 0.6611524224281311,
"adv/std_step_conf": 0.9328915476799011,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7685390490268539,
"calib/avg_num_step_conf": 6.52734375,
"calib/ece": 0.3349803921568627,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.6392156862745098,
"calib/gap": 0.23194382852919448,
"calib/mean_conf": 0.8173333333333332,
"calib/mu_c": 0.9373983739837399,
"calib/mu_w": 0.7054545454545454,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3349803921568627,
"calib/std_conf": 0.27277503958797666,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4172826086956522,
"calib/step_q_c_n": 736.0,
"calib/step_q_gap": 0.04115426644966291,
"calib/step_q_w": 0.3761283422459893,
"calib/step_q_w_n": 935.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2344.0,
"completions/max_terminated_length": 2344.0,
"completions/mean_length": 528.828125,
"completions/mean_terminated_length": 528.828125,
"completions/min_length": 200.0,
"completions/min_terminated_length": 200.0,
"epoch": 0.046933333333333334,
"grad_norm": 0.051673296838998795,
"kl": 0.0508880615234375,
"learning_rate": 4.333333333333334e-06,
"loss": 0.0202,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.031590502709150314,
"mask/share_reasoning": 0.83428955078125,
"mask/share_step_conf": 0.13411997258663177,
"num_tokens": 10464092.0,
"reward": 0.9205601811408997,
"reward_std": 0.1538148820400238,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.6768664121627808,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8689414262771606,
"step": 44
},
{
"adv/mean_abs_final_conf": 0.7376207709312439,
"adv/mean_abs_reasoning": 0.5290188193321228,
"adv/mean_abs_step_conf": 0.7491678595542908,
"adv/ratio_final_to_reasoning": 1.3943185837178298,
"adv/ratio_step_to_reasoning": 1.4161459520477975,
"adv/std_final_conf": 0.9091355800628662,
"adv/std_reasoning": 0.792652428150177,
"adv/std_step_conf": 0.9330114722251892,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6921934925151553,
"calib/avg_num_step_conf": 6.4609375,
"calib/ece": 0.3495294117647061,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.7294117647058823,
"calib/gap": 0.13181306445626606,
"calib/mean_conf": 0.8409019607843138,
"calib/mu_c": 0.9018978102189781,
"calib/mu_w": 0.770084745762712,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.32658823529411785,
"calib/std_conf": 0.26638533602857584,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4050833333333333,
"calib/step_q_c_n": 840.0,
"calib/step_q_gap": 0.051410114660114636,
"calib/step_q_w": 0.35367321867321866,
"calib/step_q_w_n": 814.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1282.0,
"completions/max_terminated_length": 1282.0,
"completions/mean_length": 473.85546875,
"completions/mean_terminated_length": 475.7137451171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.048,
"grad_norm": 0.03942892327904701,
"kl": 0.0570831298828125,
"learning_rate": 4.305555555555556e-06,
"loss": -0.0324,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03653804212808609,
"mask/share_reasoning": 0.8149880170822144,
"mask/share_step_conf": 0.14456769824028015,
"num_tokens": 10690447.0,
"reward": 0.912110447883606,
"reward_std": 0.1834811270236969,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6512120962142944,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8667587637901306,
"step": 45
},
{
"adv/mean_abs_final_conf": 0.7239193916320801,
"adv/mean_abs_reasoning": 0.3032967746257782,
"adv/mean_abs_step_conf": 0.7477271556854248,
"adv/ratio_final_to_reasoning": 2.386835113974706,
"adv/ratio_step_to_reasoning": 2.465331708878229,
"adv/std_final_conf": 0.8901360034942627,
"adv/std_reasoning": 0.5960788130760193,
"adv/std_step_conf": 0.9325743913650513,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6502356637863316,
"calib/avg_num_step_conf": 7.09375,
"calib/ece": 0.3412096774193549,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6895161290322581,
"calib/gap": 0.0969939774810159,
"calib/mean_conf": 0.8285483870967743,
"calib/mu_c": 0.8731343283582089,
"calib/mu_w": 0.776140350877193,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3147177419354839,
"calib/std_conf": 0.2664240300493203,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.3975352112676057,
"calib/step_q_c_n": 852.0,
"calib/step_q_gap": 0.030522763134825603,
"calib/step_q_w": 0.3670124481327801,
"calib/step_q_w_n": 964.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3039.0,
"completions/max_terminated_length": 3039.0,
"completions/mean_length": 558.02734375,
"completions/mean_terminated_length": 558.02734375,
"completions/min_length": 162.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.04906666666666667,
"grad_norm": 0.037199702113866806,
"kl": 0.04779052734375,
"learning_rate": 4.277777777777778e-06,
"loss": 0.0717,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.0372079461812973,
"mask/share_reasoning": 0.8186650276184082,
"mask/share_step_conf": 0.1441270411014557,
"num_tokens": 10938070.0,
"reward": 0.8839125633239746,
"reward_std": 0.15215706825256348,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.6255718469619751,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8438156247138977,
"step": 46
},
{
"adv/mean_abs_final_conf": 0.7109636068344116,
"adv/mean_abs_reasoning": 0.4669240117073059,
"adv/mean_abs_step_conf": 0.7571384906768799,
"adv/ratio_final_to_reasoning": 1.522653770224358,
"adv/ratio_step_to_reasoning": 1.6215454157270812,
"adv/std_final_conf": 0.8986199498176575,
"adv/std_reasoning": 0.7392911314964294,
"adv/std_step_conf": 0.9324113130569458,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7390342439399769,
"calib/avg_num_step_conf": 6.8046875,
"calib/ece": 0.2174103585657371,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5338645418326693,
"calib/gap": 0.2485314864691549,
"calib/mean_conf": 0.7427490039840637,
"calib/mu_c": 0.8546376811594203,
"calib/mu_w": 0.6061061946902654,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20517928286852594,
"calib/std_conf": 0.30816642672872885,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3618954248366013,
"calib/step_q_c_n": 918.0,
"calib/step_q_gap": 0.029176978234659534,
"calib/step_q_w": 0.3327184466019418,
"calib/step_q_w_n": 824.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2749.0,
"completions/max_terminated_length": 2749.0,
"completions/mean_length": 526.5,
"completions/mean_terminated_length": 530.6456909179688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 176.0,
"epoch": 0.050133333333333335,
"grad_norm": 0.04363685101270676,
"kl": 0.05706024169921875,
"learning_rate": 4.25e-06,
"loss": -0.0684,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.032928965985774994,
"mask/share_reasoning": 0.8188336491584778,
"mask/share_step_conf": 0.14042489230632782,
"num_tokens": 11178830.0,
"reward": 0.9456167817115784,
"reward_std": 0.15967297554016113,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.7287996411323547,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.858527660369873,
"step": 47
},
{
"adv/mean_abs_final_conf": 0.7372201681137085,
"adv/mean_abs_reasoning": 0.5289657115936279,
"adv/mean_abs_step_conf": 0.7410778999328613,
"adv/ratio_final_to_reasoning": 1.3937012399020483,
"adv/ratio_step_to_reasoning": 1.400994211326473,
"adv/std_final_conf": 0.9006651043891907,
"adv/std_reasoning": 0.7576570510864258,
"adv/std_step_conf": 0.933323860168457,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7045367656429208,
"calib/avg_num_step_conf": 5.6875,
"calib/ece": 0.23976095617529883,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.4581673306772908,
"calib/gap": 0.21290811775200735,
"calib/mean_conf": 0.6956972111553785,
"calib/mu_c": 0.7957894736842106,
"calib/mu_w": 0.5828813559322032,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20278884462151398,
"calib/std_conf": 0.3239236112549953,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3789867841409692,
"calib/step_q_c_n": 681.0,
"calib/step_q_gap": 0.040986784140969124,
"calib/step_q_w": 0.3380000000000001,
"calib/step_q_w_n": 775.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2967.0,
"completions/max_terminated_length": 2967.0,
"completions/mean_length": 458.0234375,
"completions/mean_terminated_length": 463.4545593261719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.0512,
"grad_norm": 0.055858712643384933,
"kl": 0.08083343505859375,
"learning_rate": 4.222222222222223e-06,
"loss": -0.0699,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03893757984042168,
"mask/share_reasoning": 0.8088648319244385,
"mask/share_step_conf": 0.14047878980636597,
"num_tokens": 11399772.0,
"reward": 0.9301211833953857,
"reward_std": 0.15484796464443207,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.7103937268257141,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8498485088348389,
"step": 48
},
{
"adv/mean_abs_final_conf": 0.6789600253105164,
"adv/mean_abs_reasoning": 0.46352851390838623,
"adv/mean_abs_step_conf": 0.7762776017189026,
"adv/ratio_final_to_reasoning": 1.4647643131717005,
"adv/ratio_step_to_reasoning": 1.6747138060040239,
"adv/std_final_conf": 0.8645612597465515,
"adv/std_reasoning": 0.7206313610076904,
"adv/std_step_conf": 0.9317847490310669,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7854982817869415,
"calib/avg_num_step_conf": 6.4765625,
"calib/ece": 0.1580566801619434,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.4939271255060729,
"calib/gap": 0.31128934707903766,
"calib/mean_conf": 0.7016194331983806,
"calib/mu_c": 0.8238666666666666,
"calib/mu_w": 0.512577319587629,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.12619433198380575,
"calib/std_conf": 0.3236691982653765,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.37662556053811663,
"calib/step_q_c_n": 892.0,
"calib/step_q_gap": 0.035450625812268044,
"calib/step_q_w": 0.3411749347258486,
"calib/step_q_w_n": 766.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3071.0,
"completions/max_terminated_length": 3071.0,
"completions/mean_length": 504.3828125,
"completions/mean_terminated_length": 506.3608093261719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.05226666666666667,
"grad_norm": 0.03892216458916664,
"kl": 0.057281494140625,
"learning_rate": 4.194444444444445e-06,
"loss": -0.0143,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03590047359466553,
"mask/share_reasoning": 0.8158446550369263,
"mask/share_step_conf": 0.14434868097305298,
"num_tokens": 11633430.0,
"reward": 0.9651601314544678,
"reward_std": 0.15499456226825714,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7683327794075012,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8510499596595764,
"step": 49
},
{
"adv/mean_abs_final_conf": 0.6920836567878723,
"adv/mean_abs_reasoning": 0.39064478874206543,
"adv/mean_abs_step_conf": 0.7383641600608826,
"adv/ratio_final_to_reasoning": 1.7716444113243774,
"adv/ratio_step_to_reasoning": 1.8901164980045566,
"adv/std_final_conf": 0.8895038366317749,
"adv/std_reasoning": 0.6613850593566895,
"adv/std_step_conf": 0.9324697256088257,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7615255376344086,
"calib/avg_num_step_conf": 5.87890625,
"calib/ece": 0.16195219123505986,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.4302788844621514,
"calib/gap": 0.3052594086021505,
"calib/mean_conf": 0.6362151394422311,
"calib/mu_c": 0.7529677419354839,
"calib/mu_w": 0.4477083333333334,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.09031872509960168,
"calib/std_conf": 0.3449678337246269,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.3922669735327963,
"calib/step_q_c_n": 869.0,
"calib/step_q_gap": 0.06339904900449439,
"calib/step_q_w": 0.32886792452830194,
"calib/step_q_w_n": 636.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2308.0,
"completions/max_terminated_length": 2308.0,
"completions/mean_length": 470.5546875,
"completions/mean_terminated_length": 472.4000244140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 128.0,
"epoch": 0.05333333333333334,
"grad_norm": 0.0452268086373806,
"kl": 0.0612335205078125,
"learning_rate": 4.166666666666667e-06,
"loss": -0.028,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03737880289554596,
"mask/share_reasoning": 0.8206923604011536,
"mask/share_step_conf": 0.13802257180213928,
"num_tokens": 11859252.0,
"reward": 0.9782302379608154,
"reward_std": 0.15078993141651154,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7732542753219604,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8652373552322388,
"step": 50
},
{
"adv/mean_abs_final_conf": 0.6727511882781982,
"adv/mean_abs_reasoning": 0.41828539967536926,
"adv/mean_abs_step_conf": 0.7507631778717041,
"adv/ratio_final_to_reasoning": 1.608354460376383,
"adv/ratio_step_to_reasoning": 1.7948586741358183,
"adv/std_final_conf": 0.8566681146621704,
"adv/std_reasoning": 0.7013798952102661,
"adv/std_step_conf": 0.9328375458717346,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7685499186109604,
"calib/avg_num_step_conf": 5.671875,
"calib/ece": 0.15409638554216873,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.3895582329317269,
"calib/gap": 0.33237791644058595,
"calib/mean_conf": 0.6095983935742971,
"calib/mu_c": 0.7390789473684211,
"calib/mu_w": 0.4067010309278351,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07662650602409644,
"calib/std_conf": 0.36308893353011723,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3618518518518518,
"calib/step_q_c_n": 783.0,
"calib/step_q_gap": 0.06986380999833908,
"calib/step_q_w": 0.29198804185351274,
"calib/step_q_w_n": 669.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2437.0,
"completions/max_terminated_length": 2437.0,
"completions/mean_length": 479.69921875,
"completions/mean_terminated_length": 481.5804138183594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.0544,
"grad_norm": 0.05431721359491348,
"kl": 0.06055450439453125,
"learning_rate": 4.138888888888889e-06,
"loss": 0.0452,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.0363931804895401,
"mask/share_reasoning": 0.8310154676437378,
"mask/share_step_conf": 0.1286850869655609,
"num_tokens": 12091351.0,
"reward": 0.9718501567840576,
"reward_std": 0.16381707787513733,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7668848037719727,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8635340929031372,
"step": 51
},
{
"adv/mean_abs_final_conf": 0.6627511978149414,
"adv/mean_abs_reasoning": 0.4197141230106354,
"adv/mean_abs_step_conf": 0.7539057731628418,
"adv/ratio_final_to_reasoning": 1.579053840411626,
"adv/ratio_step_to_reasoning": 1.796236370973245,
"adv/std_final_conf": 0.8806871175765991,
"adv/std_reasoning": 0.7013139724731445,
"adv/std_step_conf": 0.9320465922355652,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6909056316590563,
"calib/avg_num_step_conf": 5.515625,
"calib/ece": 0.2622529644268774,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.43873517786561267,
"calib/gap": 0.28380289193302904,
"calib/mean_conf": 0.5797233201581028,
"calib/mu_c": 0.6616111111111113,
"calib/mu_w": 0.3778082191780822,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06525691699604735,
"calib/std_conf": 0.3985074953417087,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3857852077001013,
"calib/step_q_c_n": 987.0,
"calib/step_q_gap": 0.07912638417068957,
"calib/step_q_w": 0.3066588235294117,
"calib/step_q_w_n": 425.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1642.0,
"completions/max_terminated_length": 1642.0,
"completions/mean_length": 445.98046875,
"completions/mean_terminated_length": 447.72943115234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.055466666666666664,
"grad_norm": 0.08861194550991058,
"kl": 0.072235107421875,
"learning_rate": 4.111111111111111e-06,
"loss": -0.0165,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.038540616631507874,
"mask/share_reasoning": 0.8251634240150452,
"mask/share_step_conf": 0.13238969445228577,
"num_tokens": 12313474.0,
"reward": 0.969064474105835,
"reward_std": 0.1488306224346161,
"rewards/accuracy_reward_step": 0.70703125,
"rewards/final_brier_reward_step": 0.7264589667320251,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8726074695587158,
"step": 52
},
{
"adv/mean_abs_final_conf": 0.5378075838088989,
"adv/mean_abs_reasoning": 0.3977751135826111,
"adv/mean_abs_step_conf": 0.7320243120193481,
"adv/ratio_final_to_reasoning": 1.3520392941757164,
"adv/ratio_step_to_reasoning": 1.8402969090405883,
"adv/std_final_conf": 0.7663177251815796,
"adv/std_reasoning": 0.6815900802612305,
"adv/std_step_conf": 0.9331000447273254,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6817835365853658,
"calib/avg_num_step_conf": 5.8671875,
"calib/ece": 0.2257936507936507,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6547619047619048,
"calib/gap": 0.24677383592017754,
"calib/mean_conf": 0.7565079365079367,
"calib/mu_c": 0.8426829268292684,
"calib/mu_w": 0.5959090909090908,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16575396825396815,
"calib/std_conf": 0.3512693929805845,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3837,
"calib/step_q_c_n": 1000.0,
"calib/step_q_gap": 0.03826175298804774,
"calib/step_q_w": 0.34543824701195225,
"calib/step_q_w_n": 502.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2711.0,
"completions/max_terminated_length": 2711.0,
"completions/mean_length": 491.625,
"completions/mean_terminated_length": 493.552978515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.05653333333333333,
"grad_norm": 0.06184534728527069,
"kl": 0.0595245361328125,
"learning_rate": 4.083333333333334e-06,
"loss": -0.0531,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.034944549202919006,
"mask/share_reasoning": 0.830845832824707,
"mask/share_step_conf": 0.13030338287353516,
"num_tokens": 12545154.0,
"reward": 0.9501216411590576,
"reward_std": 0.14969132840633392,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.7386132478713989,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.83663010597229,
"step": 53
},
{
"adv/mean_abs_final_conf": 0.38943132758140564,
"adv/mean_abs_reasoning": 0.22378817200660706,
"adv/mean_abs_step_conf": 0.7389417290687561,
"adv/ratio_final_to_reasoning": 1.7401783306488074,
"adv/ratio_step_to_reasoning": 3.301969547554729,
"adv/std_final_conf": 0.6646291017532349,
"adv/std_reasoning": 0.4960247576236725,
"adv/std_step_conf": 0.9332090020179749,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7655405405405405,
"calib/avg_num_step_conf": 5.46484375,
"calib/ece": 0.1812204724409449,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.7952755905511811,
"calib/gap": 0.2853543543543544,
"calib/mean_conf": 0.873976377952756,
"calib/mu_c": 0.9571111111111111,
"calib/mu_w": 0.6717567567567567,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1732677165354331,
"calib/std_conf": 0.25683938927458877,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.43114044350580777,
"calib/step_q_c_n": 947.0,
"calib/step_q_gap": 0.0920917709394361,
"calib/step_q_w": 0.33904867256637167,
"calib/step_q_w_n": 452.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2314.0,
"completions/max_terminated_length": 2314.0,
"completions/mean_length": 420.96875,
"completions/mean_terminated_length": 424.2834777832031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.0576,
"grad_norm": 0.04072105139493942,
"kl": 0.06040191650390625,
"learning_rate": 4.055555555555556e-06,
"loss": -0.04,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04213773459196091,
"mask/share_reasoning": 0.8122613430023193,
"mask/share_step_conf": 0.13778847455978394,
"num_tokens": 12759154.0,
"reward": 1.0076714754104614,
"reward_std": 0.11803492158651352,
"rewards/accuracy_reward_step": 0.703125,
"rewards/final_brier_reward_step": 0.8116816282272339,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8645989298820496,
"step": 54
},
{
"adv/mean_abs_final_conf": 0.5672956109046936,
"adv/mean_abs_reasoning": 0.4686982035636902,
"adv/mean_abs_step_conf": 0.7626444101333618,
"adv/ratio_final_to_reasoning": 1.2103643807280036,
"adv/ratio_step_to_reasoning": 1.6271545406717738,
"adv/std_final_conf": 0.7936248183250427,
"adv/std_reasoning": 0.7205504179000854,
"adv/std_step_conf": 0.9332450032234192,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7190927218344966,
"calib/avg_num_step_conf": 5.30859375,
"calib/ece": 0.2828346456692915,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.7244094488188977,
"calib/gap": 0.26269566301096736,
"calib/mean_conf": 0.8136220472440947,
"calib/mu_c": 0.9356617647058825,
"calib/mu_w": 0.6729661016949151,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2805118110236222,
"calib/std_conf": 0.3073064829686301,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4221771771771771,
"calib/step_q_c_n": 666.0,
"calib/step_q_gap": 0.0822781872781872,
"calib/step_q_w": 0.3398989898989899,
"calib/step_q_w_n": 693.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2751.0,
"completions/max_terminated_length": 2751.0,
"completions/mean_length": 463.44140625,
"completions/mean_terminated_length": 463.44140625,
"completions/min_length": 97.0,
"completions/min_terminated_length": 97.0,
"epoch": 0.058666666666666666,
"grad_norm": 0.04620984569191933,
"kl": 0.06476593017578125,
"learning_rate": 4.027777777777779e-06,
"loss": -0.0361,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03952161967754364,
"mask/share_reasoning": 0.8331817388534546,
"mask/share_step_conf": 0.12729665637016296,
"num_tokens": 12985619.0,
"reward": 0.9384989142417908,
"reward_std": 0.17439204454421997,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.7045695185661316,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.86774080991745,
"step": 55
},
{
"adv/mean_abs_final_conf": 0.5335018634796143,
"adv/mean_abs_reasoning": 0.4167119562625885,
"adv/mean_abs_step_conf": 0.752956748008728,
"adv/ratio_final_to_reasoning": 1.2802653138741027,
"adv/ratio_step_to_reasoning": 1.8068997941932265,
"adv/std_final_conf": 0.7589111328125,
"adv/std_reasoning": 0.6816851496696472,
"adv/std_step_conf": 0.9334322810173035,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6513919299343133,
"calib/avg_num_step_conf": 5.55859375,
"calib/ece": 0.35795275590551184,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.8503937007874016,
"calib/gap": 0.1445192367844853,
"calib/mean_conf": 0.9029133858267718,
"calib/mu_c": 0.9683453237410071,
"calib/mu_w": 0.8238260869565218,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3568110236220473,
"calib/std_conf": 0.22530333091896765,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.44694148936170214,
"calib/step_q_c_n": 752.0,
"calib/step_q_gap": 0.05115907505469758,
"calib/step_q_w": 0.39578241430700456,
"calib/step_q_w_n": 671.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1401.0,
"completions/max_terminated_length": 1401.0,
"completions/mean_length": 444.484375,
"completions/mean_terminated_length": 446.22747802734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.05973333333333333,
"grad_norm": 0.06583019345998764,
"kl": 0.0554656982421875,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0506,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.037847816944122314,
"mask/share_reasoning": 0.825727105140686,
"mask/share_step_conf": 0.13251882791519165,
"num_tokens": 13206247.0,
"reward": 0.9049012064933777,
"reward_std": 0.16697609424591064,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.6415327787399292,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8612383604049683,
"step": 56
},
{
"adv/mean_abs_final_conf": 0.5082446932792664,
"adv/mean_abs_reasoning": 0.4816570580005646,
"adv/mean_abs_step_conf": 0.7286078333854675,
"adv/ratio_final_to_reasoning": 1.0552003439730984,
"adv/ratio_step_to_reasoning": 1.5127107996922855,
"adv/std_final_conf": 0.761055588722229,
"adv/std_reasoning": 0.7574499249458313,
"adv/std_step_conf": 0.9334962964057922,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5770782889426957,
"calib/avg_num_step_conf": 5.421875,
"calib/ece": 0.2666535433070866,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.9409448818897638,
"calib/gap": 0.06645755374568973,
"calib/mean_conf": 0.9561811023622048,
"calib/mu_c": 0.9763276836158192,
"calib/mu_w": 0.9098701298701295,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.262992125984252,
"calib/std_conf": 0.14413734249370286,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.446548488008342,
"calib/step_q_c_n": 959.0,
"calib/step_q_gap": 0.05487016633002029,
"calib/step_q_w": 0.39167832167832173,
"calib/step_q_w_n": 429.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2395.0,
"completions/max_terminated_length": 2395.0,
"completions/mean_length": 465.83984375,
"completions/mean_terminated_length": 465.83984375,
"completions/min_length": 165.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.0608,
"grad_norm": 0.030439382418990135,
"kl": 0.0545654296875,
"learning_rate": 3.972222222222223e-06,
"loss": 0.0223,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03719177842140198,
"mask/share_reasoning": 0.8320133686065674,
"mask/share_step_conf": 0.13079488277435303,
"num_tokens": 13432294.0,
"reward": 0.9639207720756531,
"reward_std": 0.19327566027641296,
"rewards/accuracy_reward_step": 0.69140625,
"rewards/final_brier_reward_step": 0.7231066226959229,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8680161833763123,
"step": 57
},
{
"adv/mean_abs_final_conf": 0.5480987429618835,
"adv/mean_abs_reasoning": 0.4223048686981201,
"adv/mean_abs_step_conf": 0.7950807809829712,
"adv/ratio_final_to_reasoning": 1.297874553640739,
"adv/ratio_step_to_reasoning": 1.8827175339797604,
"adv/std_final_conf": 0.7782388925552368,
"adv/std_reasoning": 0.6815927624702454,
"adv/std_step_conf": 0.933213472366333,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.570188492063492,
"calib/avg_num_step_conf": 6.87890625,
"calib/ece": 0.3909765625,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.8671875,
"calib/gap": 0.0502083333333333,
"calib/mean_conf": 0.9107421875000001,
"calib/mu_c": 0.9327083333333334,
"calib/mu_w": 0.8825000000000001,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.369609375,
"calib/std_conf": 0.21828664304696896,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.40339342523860017,
"calib/step_q_c_n": 943.0,
"calib/step_q_gap": 0.09268437878383246,
"calib/step_q_w": 0.3107090464547677,
"calib/step_q_w_n": 818.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2102.0,
"completions/max_terminated_length": 2102.0,
"completions/mean_length": 563.21875,
"completions/mean_terminated_length": 565.427490234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.06186666666666667,
"grad_norm": 0.04508507251739502,
"kl": 0.046657562255859375,
"learning_rate": 3.944444444444445e-06,
"loss": 0.0315,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03245016932487488,
"mask/share_reasoning": 0.8317633867263794,
"mask/share_step_conf": 0.13188019394874573,
"num_tokens": 13682798.0,
"reward": 0.8925913572311401,
"reward_std": 0.17667566239833832,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.6096965074539185,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.862986147403717,
"step": 58
},
{
"adv/mean_abs_final_conf": 0.6050117611885071,
"adv/mean_abs_reasoning": 0.5482821464538574,
"adv/mean_abs_step_conf": 0.7693145275115967,
"adv/ratio_final_to_reasoning": 1.1034679226773325,
"adv/ratio_step_to_reasoning": 1.4031362000154806,
"adv/std_final_conf": 0.8122844099998474,
"adv/std_reasoning": 0.7928110361099243,
"adv/std_step_conf": 0.9340978264808655,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5672873045322026,
"calib/avg_num_step_conf": 5.9453125,
"calib/ece": 0.3519444444444445,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9563492063492064,
"calib/gap": 0.052087198515769706,
"calib/mean_conf": 0.9630555555555557,
"calib/mu_c": 0.9833116883116881,
"calib/mu_w": 0.9312244897959184,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3519444444444445,
"calib/std_conf": 0.13237373771783348,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.44518652226233457,
"calib/step_q_c_n": 831.0,
"calib/step_q_gap": 0.05318941661834037,
"calib/step_q_w": 0.3919971056439942,
"calib/step_q_w_n": 691.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3009.0,
"completions/max_terminated_length": 3009.0,
"completions/mean_length": 522.1796875,
"completions/mean_terminated_length": 524.2274780273438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 123.0,
"epoch": 0.06293333333333333,
"grad_norm": 0.04159040376543999,
"kl": 0.051868438720703125,
"learning_rate": 3.916666666666667e-06,
"loss": 0.0383,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.036025967448949814,
"mask/share_reasoning": 0.8390854597091675,
"mask/share_step_conf": 0.12098235636949539,
"num_tokens": 13922724.0,
"reward": 0.8882949352264404,
"reward_std": 0.22001904249191284,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6356261372566223,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8237762451171875,
"step": 59
},
{
"adv/mean_abs_final_conf": 0.5817492008209229,
"adv/mean_abs_reasoning": 0.5058983564376831,
"adv/mean_abs_step_conf": 0.7513135671615601,
"adv/ratio_final_to_reasoning": 1.149932972538888,
"adv/ratio_step_to_reasoning": 1.4851077446702623,
"adv/std_final_conf": 0.7956362366676331,
"adv/std_reasoning": 0.7575187087059021,
"adv/std_step_conf": 0.9340049624443054,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6091150006162948,
"calib/avg_num_step_conf": 5.66015625,
"calib/ece": 0.4308235294117646,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.9058823529411765,
"calib/gap": 0.05876124738074695,
"calib/mean_conf": 0.9383529411764706,
"calib/mu_c": 0.9664661654135338,
"calib/mu_w": 0.9077049180327869,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.42380392156862734,
"calib/std_conf": 0.18075124532746079,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4824582701062216,
"calib/step_q_c_n": 659.0,
"calib/step_q_gap": 0.0953316878277406,
"calib/step_q_w": 0.387126582278481,
"calib/step_q_w_n": 790.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1737.0,
"completions/max_terminated_length": 1737.0,
"completions/mean_length": 467.12890625,
"completions/mean_terminated_length": 468.9608154296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.064,
"grad_norm": 0.044570669531822205,
"kl": 0.05500030517578125,
"learning_rate": 3.88888888888889e-06,
"loss": -0.0669,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03605569154024124,
"mask/share_reasoning": 0.8289955258369446,
"mask/share_step_conf": 0.13104252517223358,
"num_tokens": 14151165.0,
"reward": 0.8683550953865051,
"reward_std": 0.2017507404088974,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.5711711049079895,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8624140620231628,
"step": 60
},
{
"adv/mean_abs_final_conf": 0.5088446736335754,
"adv/mean_abs_reasoning": 0.36581867933273315,
"adv/mean_abs_step_conf": 0.7902446389198303,
"adv/ratio_final_to_reasoning": 1.3909750988159681,
"adv/ratio_step_to_reasoning": 2.1602085502065282,
"adv/std_final_conf": 0.7080959677696228,
"adv/std_reasoning": 0.618648886680603,
"adv/std_step_conf": 0.9336047768592834,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5908109989557953,
"calib/avg_num_step_conf": 5.0546875,
"calib/ece": 0.3147244094488189,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.968503937007874,
"calib/gap": 0.05019422206752544,
"calib/mean_conf": 0.9762204724409449,
"calib/mu_c": 0.99301775147929,
"calib/mu_w": 0.9428235294117645,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3127952755905512,
"calib/std_conf": 0.11279504885164993,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5343728018757328,
"calib/step_q_c_n": 853.0,
"calib/step_q_gap": 0.09378323271473499,
"calib/step_q_w": 0.4405895691609978,
"calib/step_q_w_n": 441.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2524.0,
"completions/max_terminated_length": 2524.0,
"completions/mean_length": 418.03515625,
"completions/mean_terminated_length": 418.03515625,
"completions/min_length": 124.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.06506666666666666,
"grad_norm": 0.05238793045282364,
"kl": 0.06305313110351562,
"learning_rate": 3.861111111111112e-06,
"loss": -0.0263,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04262950271368027,
"mask/share_reasoning": 0.8225347399711609,
"mask/share_step_conf": 0.13483577966690063,
"num_tokens": 14362246.0,
"reward": 0.9320245981216431,
"reward_std": 0.17481377720832825,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.6849405765533447,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8486398458480835,
"step": 61
},
{
"adv/mean_abs_final_conf": 0.627860426902771,
"adv/mean_abs_reasoning": 0.5182861089706421,
"adv/mean_abs_step_conf": 0.7592391967773438,
"adv/ratio_final_to_reasoning": 1.2114166597089633,
"adv/ratio_step_to_reasoning": 1.4649036191327873,
"adv/std_final_conf": 0.8408026695251465,
"adv/std_reasoning": 0.7575954794883728,
"adv/std_step_conf": 0.9342003464698792,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6059811827956989,
"calib/avg_num_step_conf": 5.703125,
"calib/ece": 0.36788844621513944,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.952191235059761,
"calib/gap": 0.027135080645161014,
"calib/mean_conf": 0.9739442231075698,
"calib/mu_c": 0.9843225806451612,
"calib/mu_w": 0.9571875000000002,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.36215139442231076,
"calib/std_conf": 0.111826473768745,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4777091377091377,
"calib/step_q_c_n": 777.0,
"calib/step_q_gap": 0.10100342760664865,
"calib/step_q_w": 0.37670571010248904,
"calib/step_q_w_n": 683.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2620.0,
"completions/max_terminated_length": 2620.0,
"completions/mean_length": 496.6796875,
"completions/mean_terminated_length": 498.6274719238281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.06613333333333334,
"grad_norm": 0.05509721860289574,
"kl": 0.06145477294921875,
"learning_rate": 3.833333333333334e-06,
"loss": 0.0135,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03653930127620697,
"mask/share_reasoning": 0.8363149166107178,
"mask/share_step_conf": 0.12323950231075287,
"num_tokens": 14596476.0,
"reward": 0.888029932975769,
"reward_std": 0.2173173725605011,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.6231218576431274,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8365317583084106,
"step": 62
},
{
"adv/mean_abs_final_conf": 0.5727392435073853,
"adv/mean_abs_reasoning": 0.4926705062389374,
"adv/mean_abs_step_conf": 0.7514458298683167,
"adv/ratio_final_to_reasoning": 1.1625198510048738,
"adv/ratio_step_to_reasoning": 1.5252502846270999,
"adv/std_final_conf": 0.7952468991279602,
"adv/std_reasoning": 0.7574864029884338,
"adv/std_step_conf": 0.9340078234672546,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7097506437186611,
"calib/avg_num_step_conf": 5.75,
"calib/ece": 0.30972111553784853,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.8764940239043825,
"calib/gap": 0.12637755793467942,
"calib/mean_conf": 0.9352191235059761,
"calib/mu_c": 0.9825477707006369,
"calib/mu_w": 0.8561702127659575,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.30972111553784853,
"calib/std_conf": 0.17461012858597133,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4136817653890824,
"calib/step_q_c_n": 861.0,
"calib/step_q_gap": 0.07517112709121004,
"calib/step_q_w": 0.33851063829787237,
"calib/step_q_w_n": 611.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2637.0,
"completions/max_terminated_length": 2637.0,
"completions/mean_length": 528.4921875,
"completions/mean_terminated_length": 532.653564453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.0672,
"grad_norm": 0.049538128077983856,
"kl": 0.06967926025390625,
"learning_rate": 3.8055555555555556e-06,
"loss": 0.0515,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03559984266757965,
"mask/share_reasoning": 0.8368204832077026,
"mask/share_step_conf": 0.1197671964764595,
"num_tokens": 14840410.0,
"reward": 0.9256917834281921,
"reward_std": 0.19786253571510315,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.6848984360694885,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8477350473403931,
"step": 63
},
{
"adv/mean_abs_final_conf": 0.5772940516471863,
"adv/mean_abs_reasoning": 0.48638617992401123,
"adv/mean_abs_step_conf": 0.7385168075561523,
"adv/ratio_final_to_reasoning": 1.1869047178465837,
"adv/ratio_step_to_reasoning": 1.5183753939545153,
"adv/std_final_conf": 0.7961284518241882,
"adv/std_reasoning": 0.739449679851532,
"adv/std_step_conf": 0.9341706037521362,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6242081447963801,
"calib/avg_num_step_conf": 5.6015625,
"calib/ece": 0.28991935483870984,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.9274193548387096,
"calib/gap": 0.06342081447963799,
"calib/mean_conf": 0.9569354838709677,
"calib/mu_c": 0.9768823529411764,
"calib/mu_w": 0.9134615384615384,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2806854838709679,
"calib/std_conf": 0.15405632306523084,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4110068649885584,
"calib/step_q_c_n": 874.0,
"calib/step_q_gap": 0.10820329355998692,
"calib/step_q_w": 0.30280357142857145,
"calib/step_q_w_n": 560.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2770.0,
"completions/max_terminated_length": 2770.0,
"completions/mean_length": 465.09375,
"completions/mean_terminated_length": 470.60870361328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.06826666666666667,
"grad_norm": 0.05453026294708252,
"kl": 0.0635986328125,
"learning_rate": 3.777777777777778e-06,
"loss": -0.0028,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.0382472425699234,
"mask/share_reasoning": 0.8235123157501221,
"mask/share_step_conf": 0.12652164697647095,
"num_tokens": 15063250.0,
"reward": 0.9168107509613037,
"reward_std": 0.2164766490459442,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.6919308304786682,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8159095048904419,
"step": 64
},
{
"adv/mean_abs_final_conf": 0.4365313649177551,
"adv/mean_abs_reasoning": 0.26889199018478394,
"adv/mean_abs_step_conf": 0.7722085118293762,
"adv/ratio_final_to_reasoning": 1.623445029425267,
"adv/ratio_step_to_reasoning": 2.8718167145801203,
"adv/std_final_conf": 0.6810697913169861,
"adv/std_reasoning": 0.5482594966888428,
"adv/std_step_conf": 0.9330787658691406,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7023010546500479,
"calib/avg_num_step_conf": 5.09765625,
"calib/ece": 0.4046062992125985,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.968503937007874,
"calib/gap": 0.013262384148290285,
"calib/mean_conf": 0.986732283464567,
"calib/mu_c": 0.9922147651006712,
"calib/mu_w": 0.9789523809523809,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.40236220472440953,
"calib/std_conf": 0.06243067951880301,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.44182284980744546,
"calib/step_q_c_n": 779.0,
"calib/step_q_gap": 0.0518988954348219,
"calib/step_q_w": 0.38992395437262356,
"calib/step_q_w_n": 526.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2368.0,
"completions/max_terminated_length": 2368.0,
"completions/mean_length": 401.34375,
"completions/mean_terminated_length": 401.34375,
"completions/min_length": 113.0,
"completions/min_terminated_length": 113.0,
"epoch": 0.06933333333333333,
"grad_norm": 0.054482534527778625,
"kl": 0.072052001953125,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0635,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04360418766736984,
"mask/share_reasoning": 0.8223955631256104,
"mask/share_step_conf": 0.13400021195411682,
"num_tokens": 15271018.0,
"reward": 0.8871078491210938,
"reward_std": 0.11454544961452484,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.5952550768852234,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8633356690406799,
"step": 65
},
{
"adv/mean_abs_final_conf": 0.5894931554794312,
"adv/mean_abs_reasoning": 0.4546332061290741,
"adv/mean_abs_step_conf": 0.746368944644928,
"adv/ratio_final_to_reasoning": 1.2966346222235012,
"adv/ratio_step_to_reasoning": 1.641694743329038,
"adv/std_final_conf": 0.7794395685195923,
"adv/std_reasoning": 0.7014214396476746,
"adv/std_step_conf": 0.9329808354377747,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7335336839469071,
"calib/avg_num_step_conf": 5.85546875,
"calib/ece": 0.39201581027667987,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.8695652173913043,
"calib/gap": 0.13963498622589532,
"calib/mean_conf": 0.9099604743083005,
"calib/mu_c": 0.9767424242424243,
"calib/mu_w": 0.837107438016529,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.39011857707509884,
"calib/std_conf": 0.24128198126911699,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4097321428571429,
"calib/step_q_c_n": 672.0,
"calib/step_q_gap": 0.13024000259112112,
"calib/step_q_w": 0.27949214026602176,
"calib/step_q_w_n": 827.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2143.0,
"completions/max_terminated_length": 2143.0,
"completions/mean_length": 512.421875,
"completions/mean_terminated_length": 516.4566650390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 120.0,
"epoch": 0.0704,
"grad_norm": 0.02887682057917118,
"kl": 0.05440521240234375,
"learning_rate": 3.7222222222222225e-06,
"loss": -0.0044,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.035790372639894485,
"mask/share_reasoning": 0.8343870639801025,
"mask/share_step_conf": 0.12201003730297089,
"num_tokens": 15508550.0,
"reward": 0.883568525314331,
"reward_std": 0.18583741784095764,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.6040624380111694,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8622932434082031,
"step": 66
},
{
"adv/mean_abs_final_conf": 0.46097415685653687,
"adv/mean_abs_reasoning": 0.3053887188434601,
"adv/mean_abs_step_conf": 0.7570107579231262,
"adv/ratio_final_to_reasoning": 1.5094668807750842,
"adv/ratio_step_to_reasoning": 2.4788432290164724,
"adv/std_final_conf": 0.690024197101593,
"adv/std_reasoning": 0.5959534645080566,
"adv/std_step_conf": 0.9334073662757874,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7326582789477225,
"calib/avg_num_step_conf": 5.36328125,
"calib/ece": 0.28363281249999994,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.8671875,
"calib/gap": 0.1248556818946377,
"calib/mean_conf": 0.9342578125000001,
"calib/mu_c": 0.9776646706586826,
"calib/mu_w": 0.8528089887640449,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.28277343749999995,
"calib/std_conf": 0.16308619011956482,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.39914218566392473,
"calib/step_q_c_n": 851.0,
"calib/step_q_gap": 0.09987015501258373,
"calib/step_q_w": 0.299272030651341,
"calib/step_q_w_n": 522.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1571.0,
"completions/max_terminated_length": 1571.0,
"completions/mean_length": 485.20703125,
"completions/mean_terminated_length": 487.1098327636719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 190.0,
"epoch": 0.07146666666666666,
"grad_norm": 0.038448333740234375,
"kl": 0.05216217041015625,
"learning_rate": 3.694444444444445e-06,
"loss": -0.0571,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.035844773054122925,
"mask/share_reasoning": 0.8427526950836182,
"mask/share_step_conf": 0.11749625205993652,
"num_tokens": 15737771.0,
"reward": 0.9710288047790527,
"reward_std": 0.11427510529756546,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.7237683534622192,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.8878204822540283,
"step": 67
},
{
"adv/mean_abs_final_conf": 0.5147673487663269,
"adv/mean_abs_reasoning": 0.41767218708992004,
"adv/mean_abs_step_conf": 0.7670242786407471,
"adv/ratio_final_to_reasoning": 1.232467386332103,
"adv/ratio_step_to_reasoning": 1.8364265142596519,
"adv/std_final_conf": 0.7798352837562561,
"adv/std_reasoning": 0.7012434601783752,
"adv/std_step_conf": 0.9333664774894714,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6275355637513171,
"calib/avg_num_step_conf": 5.59375,
"calib/ece": 0.3505600000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.896,
"calib/gap": 0.11449947312961006,
"calib/mean_conf": 0.9345600000000001,
"calib/mu_c": 0.9821917808219178,
"calib/mu_w": 0.8676923076923078,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3505600000000001,
"calib/std_conf": 0.17880941362243766,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4176557863501484,
"calib/step_q_c_n": 674.0,
"calib/step_q_gap": 0.1449249156377473,
"calib/step_q_w": 0.2727308707124011,
"calib/step_q_w_n": 758.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2741.0,
"completions/max_terminated_length": 2741.0,
"completions/mean_length": 482.67578125,
"completions/mean_terminated_length": 484.56866455078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.07253333333333334,
"grad_norm": 0.30053746700286865,
"kl": 0.09393692016601562,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.0184,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03930802643299103,
"mask/share_reasoning": 0.8303000926971436,
"mask/share_step_conf": 0.12648558616638184,
"num_tokens": 15965424.0,
"reward": 0.8982089757919312,
"reward_std": 0.16178368031978607,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.6389793157577515,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8488449454307556,
"step": 68
},
{
"adv/mean_abs_final_conf": 0.6562919616699219,
"adv/mean_abs_reasoning": 0.4475212097167969,
"adv/mean_abs_step_conf": 0.7461894750595093,
"adv/ratio_final_to_reasoning": 1.4665047095426842,
"adv/ratio_step_to_reasoning": 1.6673834867663981,
"adv/std_final_conf": 0.8225091695785522,
"adv/std_reasoning": 0.7013913989067078,
"adv/std_step_conf": 0.934605598449707,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6312986350299783,
"calib/avg_num_step_conf": 5.828125,
"calib/ece": 0.3744223107569721,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.7569721115537849,
"calib/gap": 0.05847620870008918,
"calib/mean_conf": 0.8808764940239044,
"calib/mu_c": 0.908134328358209,
"calib/mu_w": 0.8496581196581198,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.36071713147410356,
"calib/std_conf": 0.2134931180143776,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4194698085419736,
"calib/step_q_c_n": 679.0,
"calib/step_q_gap": 0.09960510989498711,
"calib/step_q_w": 0.31986469864698647,
"calib/step_q_w_n": 813.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2466.0,
"completions/max_terminated_length": 2466.0,
"completions/mean_length": 591.515625,
"completions/mean_terminated_length": 596.1732177734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.0736,
"grad_norm": 0.08974709361791611,
"kl": 0.2734527587890625,
"learning_rate": 3.638888888888889e-06,
"loss": -0.0409,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.031212469562888145,
"mask/share_reasoning": 0.8525819778442383,
"mask/share_step_conf": 0.10839303582906723,
"num_tokens": 16221348.0,
"reward": 0.8734649419784546,
"reward_std": 0.18273797631263733,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.6022570133209229,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8438915014266968,
"step": 69
},
{
"adv/mean_abs_final_conf": 0.6758154034614563,
"adv/mean_abs_reasoning": 0.5166471004486084,
"adv/mean_abs_step_conf": 0.737113356590271,
"adv/ratio_final_to_reasoning": 1.308079350246311,
"adv/ratio_step_to_reasoning": 1.4267250429746536,
"adv/std_final_conf": 0.8379309773445129,
"adv/std_reasoning": 0.7752685546875,
"adv/std_step_conf": 0.9346587061882019,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7898451730418943,
"calib/avg_num_step_conf": 5.84375,
"calib/ece": 0.30862903225806443,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.6814516129032258,
"calib/gap": 0.262110330470986,
"calib/mean_conf": 0.8166935483870968,
"calib/mu_c": 0.9456349206349206,
"calib/mu_w": 0.6835245901639346,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.30862903225806443,
"calib/std_conf": 0.28888333892207035,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.45647058823529413,
"calib/step_q_c_n": 629.0,
"calib/step_q_gap": 0.18273356401384078,
"calib/step_q_w": 0.27373702422145335,
"calib/step_q_w_n": 867.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2707.0,
"completions/max_terminated_length": 2707.0,
"completions/mean_length": 525.234375,
"completions/mean_terminated_length": 540.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.07466666666666667,
"grad_norm": 0.04197907820343971,
"kl": 0.05033111572265625,
"learning_rate": 3.6111111111111115e-06,
"loss": -0.1658,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.0343087837100029,
"mask/share_reasoning": 0.8175104260444641,
"mask/share_step_conf": 0.12083704024553299,
"num_tokens": 16462800.0,
"reward": 0.9065558910369873,
"reward_std": 0.20242035388946533,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.6768535375595093,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8448520302772522,
"step": 70
},
{
"adv/mean_abs_final_conf": 0.6989947557449341,
"adv/mean_abs_reasoning": 0.5246307253837585,
"adv/mean_abs_step_conf": 0.737536609172821,
"adv/ratio_final_to_reasoning": 1.3323557350432176,
"adv/ratio_step_to_reasoning": 1.405820462828069,
"adv/std_final_conf": 0.8714163899421692,
"adv/std_reasoning": 0.7754472494125366,
"adv/std_step_conf": 0.9344663619995117,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6917724765935616,
"calib/avg_num_step_conf": 5.609375,
"calib/ece": 0.2707171314741035,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.549800796812749,
"calib/gap": 0.16669039374118255,
"calib/mean_conf": 0.7711155378486056,
"calib/mu_c": 0.8461594202898551,
"calib/mu_w": 0.6794690265486726,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.24601593625497997,
"calib/std_conf": 0.28485886384028153,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4204925373134329,
"calib/step_q_c_n": 670.0,
"calib/step_q_gap": 0.08605389501578276,
"calib/step_q_w": 0.33443864229765013,
"calib/step_q_w_n": 766.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2827.0,
"completions/max_terminated_length": 2827.0,
"completions/mean_length": 522.87109375,
"completions/mean_terminated_length": 522.87109375,
"completions/min_length": 117.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.07573333333333333,
"grad_norm": 0.030360069125890732,
"kl": 0.054576873779296875,
"learning_rate": 3.5833333333333335e-06,
"loss": 0.0097,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03734172135591507,
"mask/share_reasoning": 0.8406904935836792,
"mask/share_step_conf": 0.12196780741214752,
"num_tokens": 16701063.0,
"reward": 0.9102877378463745,
"reward_std": 0.18750911951065063,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.6886539459228516,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8287965059280396,
"step": 71
},
{
"adv/mean_abs_final_conf": 0.7164885997772217,
"adv/mean_abs_reasoning": 0.4583200514316559,
"adv/mean_abs_step_conf": 0.767371654510498,
"adv/ratio_final_to_reasoning": 1.5632931562542896,
"adv/ratio_step_to_reasoning": 1.6743139474554007,
"adv/std_final_conf": 0.9001376032829285,
"adv/std_reasoning": 0.7392032146453857,
"adv/std_step_conf": 0.9336949586868286,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7175264713124847,
"calib/avg_num_step_conf": 4.890625,
"calib/ece": 0.2699084967320261,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.5137254901960784,
"calib/gap": 0.188356521382254,
"calib/mean_conf": 0.7594248366013073,
"calib/mu_c": 0.8510178117048347,
"calib/mu_w": 0.6626612903225807,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2578039215686274,
"calib/std_conf": 0.28468599893357543,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.44402117834394894,
"calib/step_q_c_n": 628.0,
"calib/step_q_gap": 0.07713015270292328,
"calib/step_q_w": 0.36689102564102566,
"calib/step_q_w_n": 624.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1704.0,
"completions/max_terminated_length": 1704.0,
"completions/mean_length": 468.11328125,
"completions/mean_terminated_length": 468.11328125,
"completions/min_length": 148.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.0768,
"grad_norm": 0.05988554283976555,
"kl": 0.3415679931640625,
"learning_rate": 3.555555555555556e-06,
"loss": -0.0079,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03583850711584091,
"mask/share_reasoning": 0.8487659692764282,
"mask/share_step_conf": 0.11539548635482788,
"num_tokens": 16925308.0,
"reward": 0.9356938600540161,
"reward_std": 0.16360831260681152,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.7001357674598694,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8696893453598022,
"step": 72
},
{
"adv/mean_abs_final_conf": 0.7666511535644531,
"adv/mean_abs_reasoning": 0.614189624786377,
"adv/mean_abs_step_conf": 0.7640584707260132,
"adv/ratio_final_to_reasoning": 1.2482320160180242,
"adv/ratio_step_to_reasoning": 1.2440107092199133,
"adv/std_final_conf": 0.9226399660110474,
"adv/std_reasoning": 0.8266867399215698,
"adv/std_step_conf": 0.9349403977394104,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7521962937542895,
"calib/avg_num_step_conf": 5.10546875,
"calib/ece": 0.1332530120481928,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.42168674698795183,
"calib/gap": 0.2640789293067949,
"calib/mean_conf": 0.7317269076305221,
"calib/mu_c": 0.8314193548387097,
"calib/mu_w": 0.5673404255319148,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.12124497991967871,
"calib/std_conf": 0.2826875993640548,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.4355613422818792,
"calib/step_q_c_n": 745.0,
"calib/step_q_gap": 0.06387095082280447,
"calib/step_q_w": 0.37169039145907473,
"calib/step_q_w_n": 562.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3070.0,
"completions/max_terminated_length": 3070.0,
"completions/mean_length": 489.25,
"completions/mean_terminated_length": 491.1686706542969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.07786666666666667,
"grad_norm": 0.044797178357839584,
"kl": 0.0715484619140625,
"learning_rate": 3.5277777777777784e-06,
"loss": -0.0412,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03444898873567581,
"mask/share_reasoning": 0.8487522006034851,
"mask/share_step_conf": 0.11289255321025848,
"num_tokens": 17157588.0,
"reward": 0.9419361352920532,
"reward_std": 0.21368886530399323,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.7651835680007935,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8054074048995972,
"step": 73
},
{
"adv/mean_abs_final_conf": 0.7724243998527527,
"adv/mean_abs_reasoning": 0.599509596824646,
"adv/mean_abs_step_conf": 0.7545539736747742,
"adv/ratio_final_to_reasoning": 1.288427080974124,
"adv/ratio_step_to_reasoning": 1.258618673781594,
"adv/std_final_conf": 0.9349808692932129,
"adv/std_reasoning": 0.8099501132965088,
"adv/std_step_conf": 0.9347738027572632,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7580335254753859,
"calib/avg_num_step_conf": 4.90234375,
"calib/ece": 0.10906504065040648,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.2073170731707317,
"calib/gap": 0.28326972768833225,
"calib/mean_conf": 0.5492276422764228,
"calib/mu_c": 0.6839534883720929,
"calib/mu_w": 0.40068376068376066,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.0669512195121951,
"calib/std_conf": 0.31374704515197527,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.4218228279386712,
"calib/step_q_c_n": 587.0,
"calib/step_q_gap": 0.10288570219016818,
"calib/step_q_w": 0.318937125748503,
"calib/step_q_w_n": 668.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2676.0,
"completions/max_terminated_length": 2676.0,
"completions/mean_length": 480.95703125,
"completions/mean_terminated_length": 488.59130859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.07893333333333333,
"grad_norm": 0.03414783999323845,
"kl": 0.08736419677734375,
"learning_rate": 3.5e-06,
"loss": -0.0599,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03647315502166748,
"mask/share_reasoning": 0.8328334093093872,
"mask/share_step_conf": 0.11506839096546173,
"num_tokens": 17384641.0,
"reward": 0.9393357634544373,
"reward_std": 0.17300420999526978,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.7562761306762695,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8309890031814575,
"step": 74
},
{
"adv/mean_abs_final_conf": 0.7449522614479065,
"adv/mean_abs_reasoning": 0.4977647066116333,
"adv/mean_abs_step_conf": 0.7689626812934875,
"adv/ratio_final_to_reasoning": 1.4965951815244591,
"adv/ratio_step_to_reasoning": 1.5448316666079918,
"adv/std_final_conf": 0.9269348978996277,
"adv/std_reasoning": 0.7752844095230103,
"adv/std_step_conf": 0.934822142124176,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7669344675488343,
"calib/avg_num_step_conf": 4.8828125,
"calib/ece": 0.10792094861660081,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.40711462450592883,
"calib/gap": 0.30033043478260874,
"calib/mean_conf": 0.6924743083003954,
"calib/mu_c": 0.7743826086956522,
"calib/mu_w": 0.4740521739130435,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.03656126482213439,
"calib/std_conf": 0.3128283584172879,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.43616800920598386,
"calib/step_q_c_n": 869.0,
"calib/step_q_gap": 0.08076118505900226,
"calib/step_q_w": 0.3554068241469816,
"calib/step_q_w_n": 381.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3023.0,
"completions/max_terminated_length": 3023.0,
"completions/mean_length": 431.88671875,
"completions/mean_terminated_length": 433.5804138183594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.08,
"grad_norm": 0.08078334480524063,
"kl": 0.092681884765625,
"learning_rate": 3.4722222222222224e-06,
"loss": -0.0185,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.038396649062633514,
"mask/share_reasoning": 0.8343999981880188,
"mask/share_step_conf": 0.12329712510108948,
"num_tokens": 17599956.0,
"reward": 0.9955003261566162,
"reward_std": 0.16879957914352417,
"rewards/accuracy_reward_step": 0.71875,
"rewards/final_brier_reward_step": 0.809590220451355,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8407853841781616,
"step": 75
},
{
"adv/mean_abs_final_conf": 0.7436815500259399,
"adv/mean_abs_reasoning": 0.5508506298065186,
"adv/mean_abs_step_conf": 0.7598429918289185,
"adv/ratio_final_to_reasoning": 1.3500602700357291,
"adv/ratio_step_to_reasoning": 1.3793993338916697,
"adv/std_final_conf": 0.9284547567367554,
"adv/std_reasoning": 0.8097205758094788,
"adv/std_step_conf": 0.934494137763977,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.704641812865497,
"calib/avg_num_step_conf": 4.2890625,
"calib/ece": 0.16167729083665333,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.3665338645418327,
"calib/gap": 0.22810891812865497,
"calib/mean_conf": 0.6665298804780877,
"calib/mu_c": 0.739233918128655,
"calib/mu_w": 0.511125,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.07346613545816727,
"calib/std_conf": 0.31771581758015005,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4496159122085048,
"calib/step_q_c_n": 729.0,
"calib/step_q_gap": 0.06660778212720392,
"calib/step_q_w": 0.38300813008130086,
"calib/step_q_w_n": 369.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2252.0,
"completions/max_terminated_length": 2252.0,
"completions/mean_length": 469.73828125,
"completions/mean_terminated_length": 473.43701171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 120.0,
"epoch": 0.08106666666666666,
"grad_norm": 0.03808128088712692,
"kl": 0.091583251953125,
"learning_rate": 3.444444444444445e-06,
"loss": -0.0494,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03749001398682594,
"mask/share_reasoning": 0.8472702503204346,
"mask/share_step_conf": 0.10742717236280441,
"num_tokens": 17823265.0,
"reward": 0.9695121049880981,
"reward_std": 0.16438232362270355,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.7655134201049805,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8438230752944946,
"step": 76
},
{
"adv/mean_abs_final_conf": 0.7754688858985901,
"adv/mean_abs_reasoning": 0.46595898270606995,
"adv/mean_abs_step_conf": 0.7579265236854553,
"adv/ratio_final_to_reasoning": 1.66424280822967,
"adv/ratio_step_to_reasoning": 1.626594940361007,
"adv/std_final_conf": 0.9346413612365723,
"adv/std_reasoning": 0.7393398284912109,
"adv/std_step_conf": 0.9346475005149841,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6738372919926041,
"calib/avg_num_step_conf": 5.04296875,
"calib/ece": 0.15510121457489887,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.3441295546558704,
"calib/gap": 0.1813938273360829,
"calib/mean_conf": 0.6770445344129554,
"calib/mu_c": 0.7424050632911392,
"calib/mu_w": 0.5610112359550563,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.09623481781376528,
"calib/std_conf": 0.2917498686733845,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.42880299319727894,
"calib/step_q_c_n": 735.0,
"calib/step_q_gap": 0.09964831693828613,
"calib/step_q_w": 0.3291546762589928,
"calib/step_q_w_n": 556.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2741.0,
"completions/max_terminated_length": 2741.0,
"completions/mean_length": 484.9921875,
"completions/mean_terminated_length": 490.74310302734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.08213333333333334,
"grad_norm": 0.054215505719184875,
"kl": 0.089569091796875,
"learning_rate": 3.416666666666667e-06,
"loss": -0.054,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.040008001029491425,
"mask/share_reasoning": 0.8277705907821655,
"mask/share_step_conf": 0.12050262093544006,
"num_tokens": 18052087.0,
"reward": 0.9375009536743164,
"reward_std": 0.18988294899463654,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.7365831732749939,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8227936029434204,
"step": 77
},
{
"adv/mean_abs_final_conf": 0.7341655492782593,
"adv/mean_abs_reasoning": 0.5082210898399353,
"adv/mean_abs_step_conf": 0.728459358215332,
"adv/ratio_final_to_reasoning": 1.4445790699269985,
"adv/ratio_step_to_reasoning": 1.4333512968632627,
"adv/std_final_conf": 0.9238205552101135,
"adv/std_reasoning": 0.7927084565162659,
"adv/std_step_conf": 0.9343314170837402,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7181290024427279,
"calib/avg_num_step_conf": 4.9375,
"calib/ece": 0.21872089947089954,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5674603174603174,
"calib/gap": 0.21723846306199257,
"calib/mean_conf": 0.7611203703703704,
"calib/mu_c": 0.8464640522875818,
"calib/mu_w": 0.6292255892255892,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.18634920634920643,
"calib/std_conf": 0.31027963019646326,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.44736795212765956,
"calib/step_q_c_n": 752.0,
"calib/step_q_gap": 0.05672107712765956,
"calib/step_q_w": 0.390646875,
"calib/step_q_w_n": 512.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1540.0,
"completions/max_terminated_length": 1540.0,
"completions/mean_length": 497.40234375,
"completions/mean_terminated_length": 501.31890869140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 188.0,
"epoch": 0.0832,
"grad_norm": 0.03283295780420303,
"kl": 0.0950469970703125,
"learning_rate": 3.3888888888888893e-06,
"loss": -0.0921,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.033064231276512146,
"mask/share_reasoning": 0.8525799512863159,
"mask/share_step_conf": 0.10654333233833313,
"num_tokens": 18287446.0,
"reward": 0.945235013961792,
"reward_std": 0.19811129570007324,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.7295798063278198,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8444838523864746,
"step": 78
},
{
"adv/mean_abs_final_conf": 0.6889576315879822,
"adv/mean_abs_reasoning": 0.46892887353897095,
"adv/mean_abs_step_conf": 0.7485886812210083,
"adv/ratio_final_to_reasoning": 1.469215632614965,
"adv/ratio_step_to_reasoning": 1.5963800129675654,
"adv/std_final_conf": 0.88222736120224,
"adv/std_reasoning": 0.7206056118011475,
"adv/std_step_conf": 0.9345911145210266,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6310708598726115,
"calib/avg_num_step_conf": 5.09765625,
"calib/ece": 0.3037146245059287,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6837944664031621,
"calib/gap": 0.09512561040339707,
"calib/mean_conf": 0.8567596837944664,
"calib/mu_c": 0.8928547770700637,
"calib/mu_w": 0.7977291666666666,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2699604743083003,
"calib/std_conf": 0.24993430622062862,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4742439644218552,
"calib/step_q_c_n": 787.0,
"calib/step_q_gap": 0.06760303778092852,
"calib/step_q_w": 0.40664092664092666,
"calib/step_q_w_n": 518.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2703.0,
"completions/max_terminated_length": 2703.0,
"completions/mean_length": 509.5078125,
"completions/mean_terminated_length": 513.5196533203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.08426666666666667,
"grad_norm": 0.0327574759721756,
"kl": 0.08077239990234375,
"learning_rate": 3.3611111111111117e-06,
"loss": -0.0964,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03413655236363411,
"mask/share_reasoning": 0.8568128943443298,
"mask/share_step_conf": 0.10123801231384277,
"num_tokens": 18524256.0,
"reward": 0.9247376918792725,
"reward_std": 0.19518733024597168,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.6796905994415283,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8502533435821533,
"step": 79
},
{
"adv/mean_abs_final_conf": 0.6469038128852844,
"adv/mean_abs_reasoning": 0.5513267517089844,
"adv/mean_abs_step_conf": 0.7641560435295105,
"adv/ratio_final_to_reasoning": 1.1733582868598946,
"adv/ratio_step_to_reasoning": 1.386031135185813,
"adv/std_final_conf": 0.8788886666297913,
"adv/std_reasoning": 0.7927740216255188,
"adv/std_step_conf": 0.9344629049301147,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6631141618497111,
"calib/avg_num_step_conf": 5.50390625,
"calib/ece": 0.2358517786561264,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.8458498023715415,
"calib/gap": 0.19351343208092486,
"calib/mean_conf": 0.9094446640316206,
"calib/mu_c": 0.970634682080925,
"calib/mu_w": 0.7771212500000001,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2307509881422924,
"calib/std_conf": 0.23273399517701132,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4987823275862069,
"calib/step_q_c_n": 928.0,
"calib/step_q_gap": 0.07834511344899275,
"calib/step_q_w": 0.42043721413721413,
"calib/step_q_w_n": 481.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2738.0,
"completions/max_terminated_length": 2738.0,
"completions/mean_length": 475.51171875,
"completions/mean_terminated_length": 475.51171875,
"completions/min_length": 162.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.08533333333333333,
"grad_norm": 0.025196747854351997,
"kl": 0.09796142578125,
"learning_rate": 3.3333333333333333e-06,
"loss": -0.0351,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.037030771374702454,
"mask/share_reasoning": 0.8361947536468506,
"mask/share_step_conf": 0.12677444517612457,
"num_tokens": 18748147.0,
"reward": 0.960274875164032,
"reward_std": 0.21059617400169373,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7495396733283997,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8389787673950195,
"step": 80
},
{
"adv/mean_abs_final_conf": 0.6059747934341431,
"adv/mean_abs_reasoning": 0.4931219816207886,
"adv/mean_abs_step_conf": 0.7429808378219604,
"adv/ratio_final_to_reasoning": 1.228853744143449,
"adv/ratio_step_to_reasoning": 1.5066877274055765,
"adv/std_final_conf": 0.8232892751693726,
"adv/std_reasoning": 0.7575910687446594,
"adv/std_step_conf": 0.934981644153595,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7076734811402247,
"calib/avg_num_step_conf": 5.24609375,
"calib/ece": 0.2738987654320988,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.7901234567901234,
"calib/gap": 0.2289561762165273,
"calib/mean_conf": 0.8807514403292181,
"calib/mu_c": 0.9674344370860927,
"calib/mu_w": 0.7384782608695654,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.2666255144032922,
"calib/std_conf": 0.2550425248734223,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5346244477172312,
"calib/step_q_c_n": 679.0,
"calib/step_q_gap": 0.15555818265699023,
"calib/step_q_w": 0.37906626506024094,
"calib/step_q_w_n": 664.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 3014.0,
"completions/max_terminated_length": 3014.0,
"completions/mean_length": 503.9296875,
"completions/mean_terminated_length": 513.9681396484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 176.0,
"epoch": 0.0864,
"grad_norm": 0.023068297654390335,
"kl": 0.0867462158203125,
"learning_rate": 3.3055555555555558e-06,
"loss": -0.117,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.0342206209897995,
"mask/share_reasoning": 0.8357434868812561,
"mask/share_step_conf": 0.11050460487604141,
"num_tokens": 18983401.0,
"reward": 0.9053604602813721,
"reward_std": 0.22578772902488708,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6947583556175232,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l2_reward": 0.8089312314987183,
"step": 81
},
{
"adv/mean_abs_final_conf": 0.614736020565033,
"adv/mean_abs_reasoning": 0.5235514044761658,
"adv/mean_abs_step_conf": 0.7543563842773438,
"adv/ratio_final_to_reasoning": 1.174165545750185,
"adv/ratio_step_to_reasoning": 1.4408449253079698,
"adv/std_final_conf": 0.8214467763900757,
"adv/std_reasoning": 0.7752867937088013,
"adv/std_step_conf": 0.9348330497741699,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6453923593185338,
"calib/avg_num_step_conf": 4.890625,
"calib/ece": 0.3333201581027669,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.83399209486166,
"calib/gap": 0.1542010841507485,
"calib/mean_conf": 0.9102371541501976,
"calib/mu_c": 0.9736241610738254,
"calib/mu_w": 0.8194230769230769,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.32731225296442695,
"calib/std_conf": 0.2233431377872269,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5443872919818458,
"calib/step_q_c_n": 661.0,
"calib/step_q_gap": 0.1393788317449592,
"calib/step_q_w": 0.4050084602368866,
"calib/step_q_w_n": 591.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2514.0,
"completions/max_terminated_length": 2514.0,
"completions/mean_length": 448.171875,
"completions/mean_terminated_length": 449.929443359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.08746666666666666,
"grad_norm": 0.048648640513420105,
"kl": 0.0974578857421875,
"learning_rate": 3.277777777777778e-06,
"loss": -0.0345,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03892158344388008,
"mask/share_reasoning": 0.8418777585029602,
"mask/share_step_conf": 0.11529439687728882,
"num_tokens": 19203685.0,
"reward": 0.9137436747550964,
"reward_std": 0.21004939079284668,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.6714894771575928,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8419353365898132,
"step": 82
},
{
"adv/mean_abs_final_conf": 0.6282051205635071,
"adv/mean_abs_reasoning": 0.48869287967681885,
"adv/mean_abs_step_conf": 0.7447449564933777,
"adv/ratio_final_to_reasoning": 1.2854804043368713,
"adv/ratio_step_to_reasoning": 1.5239529517718584,
"adv/std_final_conf": 0.8340749144554138,
"adv/std_reasoning": 0.7394261360168457,
"adv/std_step_conf": 0.9344735145568848,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6325115562403698,
"calib/avg_num_step_conf": 4.62109375,
"calib/ece": 0.36741999999999986,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.772,
"calib/gap": 0.1394003595274782,
"calib/mean_conf": 0.8589,
"calib/mu_c": 0.9246969696969697,
"calib/mu_w": 0.7852966101694915,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.34915999999999986,
"calib/std_conf": 0.2897531535635117,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5318982387475538,
"calib/step_q_c_n": 511.0,
"calib/step_q_gap": 0.11579704827136333,
"calib/step_q_w": 0.4161011904761905,
"calib/step_q_w_n": 672.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1577.0,
"completions/max_terminated_length": 1577.0,
"completions/mean_length": 510.96484375,
"completions/mean_terminated_length": 514.9881591796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.08853333333333334,
"grad_norm": 0.033095818012952805,
"kl": 0.09009552001953125,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.1238,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03497224301099777,
"mask/share_reasoning": 0.8547062277793884,
"mask/share_step_conf": 0.1025090143084526,
"num_tokens": 19441756.0,
"reward": 0.8709297180175781,
"reward_std": 0.22499291598796844,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.6079858541488647,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8369985818862915,
"step": 83
},
{
"adv/mean_abs_final_conf": 0.7021284699440002,
"adv/mean_abs_reasoning": 0.5409032702445984,
"adv/mean_abs_step_conf": 0.7552073001861572,
"adv/ratio_final_to_reasoning": 1.298066601864868,
"adv/ratio_step_to_reasoning": 1.3961965876905307,
"adv/std_final_conf": 0.8890236020088196,
"adv/std_reasoning": 0.7927778363227844,
"adv/std_step_conf": 0.934401273727417,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7623575361762068,
"calib/avg_num_step_conf": 4.4140625,
"calib/ece": 0.3165027888446216,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.7370517928286853,
"calib/gap": 0.24678008707901156,
"calib/mean_conf": 0.8535370517928287,
"calib/mu_c": 0.9656204379562046,
"calib/mu_w": 0.718840350877193,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.31211155378486066,
"calib/std_conf": 0.27613141860409884,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5193537964458805,
"calib/step_q_c_n": 619.0,
"calib/step_q_gap": 0.08021485319734822,
"calib/step_q_w": 0.4391389432485323,
"calib/step_q_w_n": 511.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2868.0,
"completions/max_terminated_length": 2868.0,
"completions/mean_length": 447.01171875,
"completions/mean_terminated_length": 447.01171875,
"completions/min_length": 126.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.0896,
"grad_norm": 0.04072084650397301,
"kl": 0.0982513427734375,
"learning_rate": 3.2222222222222227e-06,
"loss": 0.002,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03983251377940178,
"mask/share_reasoning": 0.8514313697814941,
"mask/share_step_conf": 0.10873612016439438,
"num_tokens": 19662111.0,
"reward": 0.9126341342926025,
"reward_std": 0.22445017099380493,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6862176060676575,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8367067575454712,
"step": 84
},
{
"adv/mean_abs_final_conf": 0.6004186868667603,
"adv/mean_abs_reasoning": 0.457645446062088,
"adv/mean_abs_step_conf": 0.7367417812347412,
"adv/ratio_final_to_reasoning": 1.3119734764831517,
"adv/ratio_step_to_reasoning": 1.6098527529863995,
"adv/std_final_conf": 0.8238394856452942,
"adv/std_reasoning": 0.7392846941947937,
"adv/std_step_conf": 0.9346275329589844,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7251965923984273,
"calib/avg_num_step_conf": 5.1796875,
"calib/ece": 0.28236947791164657,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.7028112449799196,
"calib/gap": 0.26753211009174327,
"calib/mean_conf": 0.8218875502008033,
"calib/mu_c": 0.9389999999999998,
"calib/mu_w": 0.6714678899082566,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.27100401606425706,
"calib/std_conf": 0.300692726664778,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.45204316546762585,
"calib/step_q_c_n": 695.0,
"calib/step_q_gap": 0.08286725421564489,
"calib/step_q_w": 0.36917591125198096,
"calib/step_q_w_n": 631.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2873.0,
"completions/max_terminated_length": 2873.0,
"completions/mean_length": 530.91015625,
"completions/mean_terminated_length": 537.20556640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 99.0,
"epoch": 0.09066666666666667,
"grad_norm": 0.029364172369241714,
"kl": 0.079315185546875,
"learning_rate": 3.1944444444444443e-06,
"loss": -0.1209,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.035193197429180145,
"mask/share_reasoning": 0.8402204513549805,
"mask/share_step_conf": 0.11286762356758118,
"num_tokens": 19905848.0,
"reward": 0.910934329032898,
"reward_std": 0.23398302495479584,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6924851536750793,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": 0.8309459686279297,
"step": 85
},
{
"adv/mean_abs_final_conf": 0.7090624570846558,
"adv/mean_abs_reasoning": 0.5809318423271179,
"adv/mean_abs_step_conf": 0.7425640821456909,
"adv/ratio_final_to_reasoning": 1.2205604950905558,
"adv/ratio_step_to_reasoning": 1.2782292655384506,
"adv/std_final_conf": 0.9075257778167725,
"adv/std_reasoning": 0.8099024891853333,
"adv/std_step_conf": 0.9343748092651367,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.773311491935484,
"calib/avg_num_step_conf": 4.78515625,
"calib/ece": 0.21140476190476187,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5158730158730159,
"calib/gap": 0.3180498991935483,
"calib/mean_conf": 0.715452380952381,
"calib/mu_c": 0.871953125,
"calib/mu_w": 0.5539032258064517,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.20946031746031743,
"calib/std_conf": 0.3358017202926065,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.45,
"calib/step_q_c_n": 604.0,
"calib/step_q_gap": 0.07099339774557167,
"calib/step_q_w": 0.37900660225442834,
"calib/step_q_w_n": 621.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1889.0,
"completions/max_terminated_length": 1889.0,
"completions/mean_length": 506.765625,
"completions/mean_terminated_length": 508.7529602050781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.09173333333333333,
"grad_norm": 0.027637803927063942,
"kl": 0.08685302734375,
"learning_rate": 3.1666666666666667e-06,
"loss": -0.1019,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.035529859364032745,
"mask/share_reasoning": 0.8527402877807617,
"mask/share_step_conf": 0.10782356560230255,
"num_tokens": 20141092.0,
"reward": 0.9500056505203247,
"reward_std": 0.20182755589485168,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.7396859526634216,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8642314672470093,
"step": 86
},
{
"adv/mean_abs_final_conf": 0.632361888885498,
"adv/mean_abs_reasoning": 0.47548264265060425,
"adv/mean_abs_step_conf": 0.757982611656189,
"adv/ratio_final_to_reasoning": 1.3299368518698427,
"adv/ratio_step_to_reasoning": 1.5941330842925685,
"adv/std_final_conf": 0.8701613545417786,
"adv/std_reasoning": 0.7393690347671509,
"adv/std_step_conf": 0.9344350099563599,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7125527426160339,
"calib/avg_num_step_conf": 4.5703125,
"calib/ece": 0.18223346828609988,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.6639676113360324,
"calib/gap": 0.22428985834840287,
"calib/mean_conf": 0.8303171390013495,
"calib/mu_c": 0.9020535714285716,
"calib/mu_w": 0.6777637130801687,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.16619433198380568,
"calib/std_conf": 0.274614031204716,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.493969571230982,
"calib/step_q_c_n": 723.0,
"calib/step_q_gap": 0.10150872111912518,
"calib/step_q_w": 0.39246085011185683,
"calib/step_q_w_n": 447.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 3028.0,
"completions/max_terminated_length": 3028.0,
"completions/mean_length": 418.453125,
"completions/mean_terminated_length": 426.7888488769531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 96.0,
"epoch": 0.0928,
"grad_norm": 0.029977787286043167,
"kl": 0.09900665283203125,
"learning_rate": 3.138888888888889e-06,
"loss": -0.0592,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.04178471863269806,
"mask/share_reasoning": 0.8263924717903137,
"mask/share_step_conf": 0.11229157447814941,
"num_tokens": 20353712.0,
"reward": 0.9529500603675842,
"reward_std": 0.19986681640148163,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.7545884847640991,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8263115286827087,
"step": 87
},
{
"adv/mean_abs_final_conf": 0.6885051727294922,
"adv/mean_abs_reasoning": 0.5059357285499573,
"adv/mean_abs_step_conf": 0.7486658096313477,
"adv/ratio_final_to_reasoning": 1.3608550135464639,
"adv/ratio_step_to_reasoning": 1.4797646566236182,
"adv/std_final_conf": 0.8923454284667969,
"adv/std_reasoning": 0.757586658000946,
"adv/std_step_conf": 0.9346413016319275,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.8414918414918414,
"calib/avg_num_step_conf": 5.01171875,
"calib/ece": 0.12470588235294121,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5490196078431373,
"calib/gap": 0.42314296814296787,
"calib/mean_conf": 0.7281568627450979,
"calib/mu_c": 0.8924358974358972,
"calib/mu_w": 0.4692929292929294,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.12054901960784317,
"calib/std_conf": 0.3348949716862674,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4505395683453237,
"calib/step_q_c_n": 834.0,
"calib/step_q_gap": 0.08880237458140383,
"calib/step_q_w": 0.3617371937639199,
"calib/step_q_w_n": 449.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1680.0,
"completions/max_terminated_length": 1680.0,
"completions/mean_length": 485.25,
"completions/mean_terminated_length": 487.1529846191406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 120.0,
"epoch": 0.09386666666666667,
"grad_norm": 0.03266888111829758,
"kl": 0.08789825439453125,
"learning_rate": 3.1111111111111116e-06,
"loss": -0.1147,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.035181909799575806,
"mask/share_reasoning": 0.8508784770965576,
"mask/share_step_conf": 0.11003339290618896,
"num_tokens": 20587784.0,
"reward": 0.9882926940917969,
"reward_std": 0.17865976691246033,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.8217456936836243,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8368707895278931,
"step": 88
},
{
"adv/mean_abs_final_conf": 0.6719787120819092,
"adv/mean_abs_reasoning": 0.49249130487442017,
"adv/mean_abs_step_conf": 0.7202622294425964,
"adv/ratio_final_to_reasoning": 1.3644478703096217,
"adv/ratio_step_to_reasoning": 1.462487199903469,
"adv/std_final_conf": 0.8897674083709717,
"adv/std_reasoning": 0.7575967907905579,
"adv/std_step_conf": 0.9350314736366272,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7737520661157025,
"calib/avg_num_step_conf": 4.375,
"calib/ece": 0.178780487804878,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.45121951219512196,
"calib/gap": 0.36873256198347104,
"calib/mean_conf": 0.6444715447154471,
"calib/mu_c": 0.82584,
"calib/mu_w": 0.457107438016529,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.15756097560975607,
"calib/std_conf": 0.3605882800742173,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.47622907662082514,
"calib/step_q_c_n": 509.0,
"calib/step_q_gap": 0.12298848742278912,
"calib/step_q_w": 0.353240589198036,
"calib/step_q_w_n": 611.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3022.0,
"completions/max_terminated_length": 3022.0,
"completions/mean_length": 555.4765625,
"completions/mean_terminated_length": 555.4765625,
"completions/min_length": 134.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.09493333333333333,
"grad_norm": 0.02888135053217411,
"kl": 0.08712005615234375,
"learning_rate": 3.0833333333333336e-06,
"loss": -0.0078,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.038115061819553375,
"mask/share_reasoning": 0.8615081310272217,
"mask/share_step_conf": 0.10037682950496674,
"num_tokens": 20838874.0,
"reward": 0.9215790629386902,
"reward_std": 0.21322081983089447,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.7409582138061523,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": 0.8154811859130859,
"step": 89
},
{
"adv/mean_abs_final_conf": 0.6734800338745117,
"adv/mean_abs_reasoning": 0.4882771372795105,
"adv/mean_abs_step_conf": 0.7472188472747803,
"adv/ratio_final_to_reasoning": 1.3792987270034378,
"adv/ratio_step_to_reasoning": 1.530317088852433,
"adv/std_final_conf": 0.8759849667549133,
"adv/std_reasoning": 0.7575928568840027,
"adv/std_step_conf": 0.9344674944877625,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6883633178357254,
"calib/avg_num_step_conf": 5.38671875,
"calib/ece": 0.21848000000000006,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.56,
"calib/gap": 0.24580149585607447,
"calib/mean_conf": 0.69208,
"calib/mu_c": 0.7874509803921569,
"calib/mu_w": 0.5416494845360824,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.14928000000000005,
"calib/std_conf": 0.3698741321044228,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4144972972972973,
"calib/step_q_c_n": 777.0,
"calib/step_q_gap": 0.06046075244679894,
"calib/step_q_w": 0.35403654485049835,
"calib/step_q_w_n": 602.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2434.0,
"completions/max_terminated_length": 2434.0,
"completions/mean_length": 505.44921875,
"completions/mean_terminated_length": 505.44921875,
"completions/min_length": 157.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.096,
"grad_norm": 0.02644607611000538,
"kl": 0.083831787109375,
"learning_rate": 3.055555555555556e-06,
"loss": 0.0431,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.035364821553230286,
"mask/share_reasoning": 0.845434308052063,
"mask/share_step_conf": 0.11920082569122314,
"num_tokens": 21071589.0,
"reward": 0.9392973780632019,
"reward_std": 0.2013944387435913,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.71880704164505,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8449438810348511,
"step": 90
},
{
"adv/mean_abs_final_conf": 0.7215551137924194,
"adv/mean_abs_reasoning": 0.5888091325759888,
"adv/mean_abs_step_conf": 0.7760694622993469,
"adv/ratio_final_to_reasoning": 1.225448237590471,
"adv/ratio_step_to_reasoning": 1.3180323119380137,
"adv/std_final_conf": 0.9045106768608093,
"adv/std_reasoning": 0.8099533915519714,
"adv/std_step_conf": 0.934012770652771,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7291061046511628,
"calib/avg_num_step_conf": 5.203125,
"calib/ece": 0.18215447154471542,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.43902439024390244,
"calib/gap": 0.29467005813953484,
"calib/mean_conf": 0.6414227642276422,
"calib/mu_c": 0.7444375,
"calib/mu_w": 0.44976744186046513,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.08658536585365852,
"calib/std_conf": 0.3549338770007247,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.41673768308921444,
"calib/step_q_c_n": 751.0,
"calib/step_q_gap": 0.10808019599799246,
"calib/step_q_w": 0.308657487091222,
"calib/step_q_w_n": 581.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2746.0,
"completions/max_terminated_length": 2746.0,
"completions/mean_length": 513.13671875,
"completions/mean_terminated_length": 519.2213745117188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.09706666666666666,
"grad_norm": 0.040382903069257736,
"kl": 0.08388519287109375,
"learning_rate": 3.0277777777777776e-06,
"loss": -0.0624,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03338609263300896,
"mask/share_reasoning": 0.8500442504882812,
"mask/share_step_conf": 0.1048509031534195,
"num_tokens": 21310664.0,
"reward": 0.9495749473571777,
"reward_std": 0.18228210508823395,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7500753998756409,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8318870663642883,
"step": 91
},
{
"adv/mean_abs_final_conf": 0.7286385297775269,
"adv/mean_abs_reasoning": 0.46878039836883545,
"adv/mean_abs_step_conf": 0.7632089257240295,
"adv/ratio_final_to_reasoning": 1.5543280655780227,
"adv/ratio_step_to_reasoning": 1.6280734612191237,
"adv/std_final_conf": 0.9158452153205872,
"adv/std_reasoning": 0.7574487924575806,
"adv/std_step_conf": 0.9339590072631836,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7326384188439149,
"calib/avg_num_step_conf": 4.7109375,
"calib/ece": 0.14370823529411766,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.45098039215686275,
"calib/gap": 0.29875386489779354,
"calib/mean_conf": 0.6746447058823529,
"calib/mu_c": 0.7789156626506025,
"calib/mu_w": 0.480161797752809,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08368627450980395,
"calib/std_conf": 0.3472926672800365,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4442895442359249,
"calib/step_q_c_n": 746.0,
"calib/step_q_gap": 0.09637650075766402,
"calib/step_q_w": 0.3479130434782609,
"calib/step_q_w_n": 460.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2936.0,
"completions/max_terminated_length": 2936.0,
"completions/mean_length": 443.125,
"completions/mean_terminated_length": 443.125,
"completions/min_length": 151.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.09813333333333334,
"grad_norm": 0.0689389705657959,
"kl": 0.08803558349609375,
"learning_rate": 3e-06,
"loss": -0.0926,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03953507915139198,
"mask/share_reasoning": 0.843379557132721,
"mask/share_step_conf": 0.11708534508943558,
"num_tokens": 21530824.0,
"reward": 0.9877589344978333,
"reward_std": 0.16079741716384888,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.7843038439750671,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8630890250205994,
"step": 92
},
{
"adv/mean_abs_final_conf": 0.6992151737213135,
"adv/mean_abs_reasoning": 0.4734675884246826,
"adv/mean_abs_step_conf": 0.7822864055633545,
"adv/ratio_final_to_reasoning": 1.4767962809191149,
"adv/ratio_step_to_reasoning": 1.6522491183951393,
"adv/std_final_conf": 0.8908438086509705,
"adv/std_reasoning": 0.7207232713699341,
"adv/std_step_conf": 0.9336625337600708,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.710665362035225,
"calib/avg_num_step_conf": 6.00390625,
"calib/ece": 0.15475431606905712,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.43824701195219123,
"calib/gap": 0.2935055446836268,
"calib/mean_conf": 0.6632005312084992,
"calib/mu_c": 0.7859817351598173,
"calib/mu_w": 0.4924761904761905,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11814077025232407,
"calib/std_conf": 0.3554428247470627,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4403506493506493,
"calib/step_q_c_n": 770.0,
"calib/step_q_gap": 0.08889041467007569,
"calib/step_q_w": 0.35146023468057364,
"calib/step_q_w_n": 767.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2697.0,
"completions/max_terminated_length": 2697.0,
"completions/mean_length": 521.6328125,
"completions/mean_terminated_length": 523.678466796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.0992,
"grad_norm": 0.046443577855825424,
"kl": 0.0834197998046875,
"learning_rate": 2.9722222222222225e-06,
"loss": -0.0024,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.0352531261742115,
"mask/share_reasoning": 0.8352970480918884,
"mask/share_step_conf": 0.12554356455802917,
"num_tokens": 21770138.0,
"reward": 0.9602721333503723,
"reward_std": 0.1766793429851532,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7515501976013184,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8588377237319946,
"step": 93
},
{
"adv/mean_abs_final_conf": 0.6296205520629883,
"adv/mean_abs_reasoning": 0.4622064232826233,
"adv/mean_abs_step_conf": 0.7736461758613586,
"adv/ratio_final_to_reasoning": 1.3622064089706452,
"adv/ratio_step_to_reasoning": 1.6738109573788866,
"adv/std_final_conf": 0.859623908996582,
"adv/std_reasoning": 0.7392425537109375,
"adv/std_step_conf": 0.9339349269866943,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.8167848699763592,
"calib/avg_num_step_conf": 5.046875,
"calib/ece": 0.1941666666666667,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.5277777777777778,
"calib/gap": 0.3569963580601878,
"calib/mean_conf": 0.7235317460317461,
"calib/mu_c": 0.8807801418439716,
"calib/mu_w": 0.5237837837837838,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.17908730158730163,
"calib/std_conf": 0.33992854493092534,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5006395348837208,
"calib/step_q_c_n": 688.0,
"calib/step_q_gap": 0.1298448328969658,
"calib/step_q_w": 0.370794701986755,
"calib/step_q_w_n": 604.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2404.0,
"completions/max_terminated_length": 2404.0,
"completions/mean_length": 441.47265625,
"completions/mean_terminated_length": 444.9488220214844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.10026666666666667,
"grad_norm": 0.03776202350854874,
"kl": 0.0876617431640625,
"learning_rate": 2.944444444444445e-06,
"loss": -0.0469,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.037618488073349,
"mask/share_reasoning": 0.8373420238494873,
"mask/share_step_conf": 0.1172269657254219,
"num_tokens": 21991835.0,
"reward": 0.9584506750106812,
"reward_std": 0.16993385553359985,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7676078081130981,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8438247442245483,
"step": 94
},
{
"adv/mean_abs_final_conf": 0.5176414847373962,
"adv/mean_abs_reasoning": 0.449894517660141,
"adv/mean_abs_step_conf": 0.766070544719696,
"adv/ratio_final_to_reasoning": 1.1505841134264112,
"adv/ratio_step_to_reasoning": 1.7027781283131807,
"adv/std_final_conf": 0.7770000100135803,
"adv/std_reasoning": 0.7205990552902222,
"adv/std_step_conf": 0.9334955215454102,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7958348556814031,
"calib/avg_num_step_conf": 4.93359375,
"calib/ece": 0.17776422764227634,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.7113821138211383,
"calib/gap": 0.3479378881987578,
"calib/mean_conf": 0.8097154471544715,
"calib/mu_c": 0.9299378881987578,
"calib/mu_w": 0.582,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.16650406504065032,
"calib/std_conf": 0.32364715355659757,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.49762077294685986,
"calib/step_q_c_n": 828.0,
"calib/step_q_gap": 0.10221847409628515,
"calib/step_q_w": 0.3954022988505747,
"calib/step_q_w_n": 435.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2964.0,
"completions/max_terminated_length": 2964.0,
"completions/mean_length": 475.5625,
"completions/mean_terminated_length": 483.11114501953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 119.0,
"epoch": 0.10133333333333333,
"grad_norm": 0.030413884669542313,
"kl": 0.073638916015625,
"learning_rate": 2.916666666666667e-06,
"loss": -0.1127,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.036571212112903595,
"mask/share_reasoning": 0.8310877680778503,
"mask/share_step_conf": 0.11671602725982666,
"num_tokens": 22219707.0,
"reward": 0.9627537727355957,
"reward_std": 0.1842033863067627,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.7710347771644592,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8365039825439453,
"step": 95
},
{
"adv/mean_abs_final_conf": 0.45578479766845703,
"adv/mean_abs_reasoning": 0.3871430456638336,
"adv/mean_abs_step_conf": 0.7665640115737915,
"adv/ratio_final_to_reasoning": 1.1773033321234623,
"adv/ratio_step_to_reasoning": 1.9800536782453764,
"adv/std_final_conf": 0.7365880012512207,
"adv/std_reasoning": 0.6611899733543396,
"adv/std_step_conf": 0.9334518909454346,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7475511695906434,
"calib/avg_num_step_conf": 5.2109375,
"calib/ece": 0.15976562499999997,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.76953125,
"calib/gap": 0.34566081871345034,
"calib/mean_conf": 0.8459375,
"calib/mu_c": 0.9485555555555556,
"calib/mu_w": 0.6028947368421053,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15128906249999996,
"calib/std_conf": 0.3013004042376147,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5316342857142857,
"calib/step_q_c_n": 875.0,
"calib/step_q_gap": 0.12318308745720508,
"calib/step_q_w": 0.40845119825708065,
"calib/step_q_w_n": 459.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1169.0,
"completions/max_terminated_length": 1169.0,
"completions/mean_length": 414.76171875,
"completions/mean_terminated_length": 416.3882751464844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 136.0,
"epoch": 0.1024,
"grad_norm": 0.040975358337163925,
"kl": 0.114013671875,
"learning_rate": 2.888888888888889e-06,
"loss": 0.0935,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.04015825688838959,
"mask/share_reasoning": 0.8285998702049255,
"mask/share_step_conf": 0.12733563780784607,
"num_tokens": 22431702.0,
"reward": 1.0118509531021118,
"reward_std": 0.14020463824272156,
"rewards/accuracy_reward_step": 0.703125,
"rewards/final_brier_reward_step": 0.8243891000747681,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.8586878776550293,
"step": 96
},
{
"adv/mean_abs_final_conf": 0.6050560474395752,
"adv/mean_abs_reasoning": 0.5235856771469116,
"adv/mean_abs_step_conf": 0.7516707181930542,
"adv/ratio_final_to_reasoning": 1.1556008383128555,
"adv/ratio_step_to_reasoning": 1.4356212383215072,
"adv/std_final_conf": 0.8272944688796997,
"adv/std_reasoning": 0.7754360437393188,
"adv/std_step_conf": 0.9345774054527283,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6281774109014676,
"calib/avg_num_step_conf": 5.125,
"calib/ece": 0.31728,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.768,
"calib/gap": 0.18736242138364767,
"calib/mean_conf": 0.8415999999999999,
"calib/mu_c": 0.9210416666666666,
"calib/mu_w": 0.733679245283019,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.29144,
"calib/std_conf": 0.3124481396968143,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5813196480938417,
"calib/step_q_c_n": 682.0,
"calib/step_q_gap": 0.16146250523669892,
"calib/step_q_w": 0.4198571428571428,
"calib/step_q_w_n": 630.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3038.0,
"completions/max_terminated_length": 3038.0,
"completions/mean_length": 472.0859375,
"completions/mean_terminated_length": 472.0859375,
"completions/min_length": 151.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.10346666666666667,
"grad_norm": 0.048959359526634216,
"kl": 0.078338623046875,
"learning_rate": 2.861111111111111e-06,
"loss": -0.0059,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.037389181554317474,
"mask/share_reasoning": 0.839634358882904,
"mask/share_step_conf": 0.12297643721103668,
"num_tokens": 22657628.0,
"reward": 0.905328631401062,
"reward_std": 0.22340336441993713,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.6632086038589478,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8396360874176025,
"step": 97
},
{
"adv/mean_abs_final_conf": 0.5784022808074951,
"adv/mean_abs_reasoning": 0.532549262046814,
"adv/mean_abs_step_conf": 0.7428262233734131,
"adv/ratio_final_to_reasoning": 1.0861009901402332,
"adv/ratio_step_to_reasoning": 1.3948497844470107,
"adv/std_final_conf": 0.7927812337875366,
"adv/std_reasoning": 0.7927854061126709,
"adv/std_step_conf": 0.9343677759170532,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.6593717992488903,
"calib/avg_num_step_conf": 4.671875,
"calib/ece": 0.2732520325203251,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.7642276422764228,
"calib/gap": 0.2434988050529191,
"calib/mean_conf": 0.8292682926829268,
"calib/mu_c": 0.9292413793103449,
"calib/mu_w": 0.6857425742574258,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.2565447154471544,
"calib/std_conf": 0.325585578104195,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.5994912559618442,
"calib/step_q_c_n": 629.0,
"calib/step_q_gap": 0.11019495966554782,
"calib/step_q_w": 0.48929629629629634,
"calib/step_q_w_n": 567.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2677.0,
"completions/max_terminated_length": 2677.0,
"completions/mean_length": 503.44921875,
"completions/mean_terminated_length": 503.44921875,
"completions/min_length": 94.0,
"completions/min_terminated_length": 94.0,
"epoch": 0.10453333333333334,
"grad_norm": 0.026320848613977432,
"kl": 0.07251739501953125,
"learning_rate": 2.8333333333333335e-06,
"loss": -0.0586,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.036959365010261536,
"mask/share_reasoning": 0.8525199890136719,
"mask/share_step_conf": 0.11052063852548599,
"num_tokens": 22892695.0,
"reward": 0.8927949666976929,
"reward_std": 0.22106841206550598,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.676842987537384,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8048408031463623,
"step": 98
},
{
"adv/mean_abs_final_conf": 0.6866989135742188,
"adv/mean_abs_reasoning": 0.6365125775337219,
"adv/mean_abs_step_conf": 0.7182142734527588,
"adv/ratio_final_to_reasoning": 1.0788457884602256,
"adv/ratio_step_to_reasoning": 1.1283583369799293,
"adv/std_final_conf": 0.8761346936225891,
"adv/std_reasoning": 0.8589560389518738,
"adv/std_step_conf": 0.9351794123649597,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.727749229188078,
"calib/avg_num_step_conf": 5.62109375,
"calib/ece": 0.298688524590164,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.569672131147541,
"calib/gap": 0.2896731757451182,
"calib/mean_conf": 0.6904098360655739,
"calib/mu_c": 0.8554285714285713,
"calib/mu_w": 0.5657553956834531,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.2793852459016394,
"calib/std_conf": 0.37926351741362724,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5856870229007634,
"calib/step_q_c_n": 524.0,
"calib/step_q_gap": 0.1374465857422934,
"calib/step_q_w": 0.44824043715846995,
"calib/step_q_w_n": 915.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2531.0,
"completions/max_terminated_length": 2531.0,
"completions/mean_length": 547.33203125,
"completions/mean_terminated_length": 560.468017578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.1056,
"grad_norm": 0.03607296198606491,
"kl": 0.0640716552734375,
"learning_rate": 2.805555555555556e-06,
"loss": -0.0437,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.03173251822590828,
"mask/share_reasoning": 0.8410789966583252,
"mask/share_step_conf": 0.10375095903873444,
"num_tokens": 23138612.0,
"reward": 0.8512870073318481,
"reward_std": 0.25130918622016907,
"rewards/accuracy_reward_step": 0.41015625,
"rewards/final_brier_reward_step": 0.6493609547615051,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.7813380360603333,
"step": 99
},
{
"adv/mean_abs_final_conf": 0.5975271463394165,
"adv/mean_abs_reasoning": 0.4342886209487915,
"adv/mean_abs_step_conf": 0.7729626893997192,
"adv/ratio_final_to_reasoning": 1.3758756677391117,
"adv/ratio_step_to_reasoning": 1.7798363855608872,
"adv/std_final_conf": 0.838344395160675,
"adv/std_reasoning": 0.6816809773445129,
"adv/std_step_conf": 0.9339935183525085,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7793527508090615,
"calib/avg_num_step_conf": 5.21484375,
"calib/ece": 0.18007075098814238,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6442687747035574,
"calib/gap": 0.41364636245954683,
"calib/mean_conf": 0.7285458498023716,
"calib/mu_c": 0.8969473333333333,
"calib/mu_w": 0.4833009708737865,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1578656126482214,
"calib/std_conf": 0.3833230272162609,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5946691176470589,
"calib/step_q_c_n": 816.0,
"calib/step_q_gap": 0.1484070752578489,
"calib/step_q_w": 0.44626204238920997,
"calib/step_q_w_n": 519.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1848.0,
"completions/max_terminated_length": 1848.0,
"completions/mean_length": 501.37890625,
"completions/mean_terminated_length": 507.3241271972656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.10666666666666667,
"grad_norm": 0.033330369740724564,
"kl": 0.06787872314453125,
"learning_rate": 2.7777777777777783e-06,
"loss": -0.0849,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03490392118692398,
"mask/share_reasoning": 0.8391332626342773,
"mask/share_step_conf": 0.11424408107995987,
"num_tokens": 23374373.0,
"reward": 0.9700980186462402,
"reward_std": 0.1729770302772522,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.7836803197860718,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8416720628738403,
"step": 100
},
{
"adv/mean_abs_final_conf": 0.656168520450592,
"adv/mean_abs_reasoning": 0.4810579717159271,
"adv/mean_abs_step_conf": 0.7474485635757446,
"adv/ratio_final_to_reasoning": 1.3640113230221464,
"adv/ratio_step_to_reasoning": 1.5537598533282921,
"adv/std_final_conf": 0.8590138554573059,
"adv/std_reasoning": 0.7576479911804199,
"adv/std_step_conf": 0.9346694946289062,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7411160831797843,
"calib/avg_num_step_conf": 5.92578125,
"calib/ece": 0.24054412955465587,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.48582995951417,
"calib/gap": 0.3265280600157936,
"calib/mean_conf": 0.6508728744939272,
"calib/mu_c": 0.824051724137931,
"calib/mu_w": 0.49752366412213744,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.21089068825910934,
"calib/std_conf": 0.3882587265444154,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5500429184549357,
"calib/step_q_c_n": 699.0,
"calib/step_q_gap": 0.09687665928623151,
"calib/step_q_w": 0.4531662591687042,
"calib/step_q_w_n": 818.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2952.0,
"completions/max_terminated_length": 2952.0,
"completions/mean_length": 559.57421875,
"completions/mean_terminated_length": 563.9802856445312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.10773333333333333,
"grad_norm": 0.043247781693935394,
"kl": 0.07190704345703125,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.0092,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.033356837928295135,
"mask/share_reasoning": 0.8435466885566711,
"mask/share_step_conf": 0.11528396606445312,
"num_tokens": 23624616.0,
"reward": 0.8842967748641968,
"reward_std": 0.21739190816879272,
"rewards/accuracy_reward_step": 0.453125,
"rewards/final_brier_reward_step": 0.7008475065231323,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.7849335074424744,
"step": 101
},
{
"adv/mean_abs_final_conf": 0.5095280408859253,
"adv/mean_abs_reasoning": 0.31084319949150085,
"adv/mean_abs_step_conf": 0.7629257440567017,
"adv/ratio_final_to_reasoning": 1.6391802739112422,
"adv/ratio_step_to_reasoning": 2.454374891600489,
"adv/std_final_conf": 0.7696377635002136,
"adv/std_reasoning": 0.6184869408607483,
"adv/std_step_conf": 0.9334552884101868,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.783984165324745,
"calib/avg_num_step_conf": 5.06640625,
"calib/ece": 0.15271732283464579,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.6377952755905512,
"calib/gap": 0.42152310789049907,
"calib/mean_conf": 0.7429519685039369,
"calib/mu_c": 0.8956296296296296,
"calib/mu_w": 0.4741065217391305,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.12893700787401588,
"calib/std_conf": 0.36587584193100725,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5906177606177606,
"calib/step_q_c_n": 777.0,
"calib/step_q_gap": 0.16017218369468372,
"calib/step_q_w": 0.4304455769230769,
"calib/step_q_w_n": 520.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1458.0,
"completions/max_terminated_length": 1458.0,
"completions/mean_length": 408.7421875,
"completions/mean_terminated_length": 410.3451232910156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 93.0,
"epoch": 0.1088,
"grad_norm": 0.03269050642848015,
"kl": 0.0802764892578125,
"learning_rate": 2.7222222222222224e-06,
"loss": -0.0296,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.04328569769859314,
"mask/share_reasoning": 0.8195648193359375,
"mask/share_step_conf": 0.13324323296546936,
"num_tokens": 23835950.0,
"reward": 0.9991936087608337,
"reward_std": 0.1344832181930542,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.8124216198921204,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8609656095504761,
"step": 102
},
{
"adv/mean_abs_final_conf": 0.5931665897369385,
"adv/mean_abs_reasoning": 0.4472111463546753,
"adv/mean_abs_step_conf": 0.7450423240661621,
"adv/ratio_final_to_reasoning": 1.3263680804290787,
"adv/ratio_step_to_reasoning": 1.665974406360798,
"adv/std_final_conf": 0.8274978995323181,
"adv/std_reasoning": 0.7392292022705078,
"adv/std_step_conf": 0.9343953132629395,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7972413793103449,
"calib/avg_num_step_conf": 6.109375,
"calib/ece": 0.1719200000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.64,
"calib/gap": 0.43064039408866994,
"calib/mean_conf": 0.7312000000000001,
"calib/mu_c": 0.9120689655172414,
"calib/mu_w": 0.48142857142857143,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.16156000000000012,
"calib/std_conf": 0.3776561398944813,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6325307391091531,
"calib/step_q_c_n": 681.0,
"calib/step_q_gap": 0.19963153186113503,
"calib/step_q_w": 0.4328992072480181,
"calib/step_q_w_n": 883.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2704.0,
"completions/max_terminated_length": 2704.0,
"completions/mean_length": 590.5390625,
"completions/mean_terminated_length": 590.5390625,
"completions/min_length": 107.0,
"completions/min_terminated_length": 107.0,
"epoch": 0.10986666666666667,
"grad_norm": 0.02490939199924469,
"kl": 0.06413650512695312,
"learning_rate": 2.6944444444444444e-06,
"loss": 0.1038,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03427442908287048,
"mask/share_reasoning": 0.8512352108955383,
"mask/share_step_conf": 0.11449037492275238,
"num_tokens": 24091680.0,
"reward": 0.9671297073364258,
"reward_std": 0.19709224998950958,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.7819554805755615,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8437101244926453,
"step": 103
},
{
"adv/mean_abs_final_conf": 0.6492934823036194,
"adv/mean_abs_reasoning": 0.47703713178634644,
"adv/mean_abs_step_conf": 0.7521624565124512,
"adv/ratio_final_to_reasoning": 1.3610963152327993,
"adv/ratio_step_to_reasoning": 1.5767377556037436,
"adv/std_final_conf": 0.8646968603134155,
"adv/std_reasoning": 0.7393373847007751,
"adv/std_step_conf": 0.9346559643745422,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.794665404040404,
"calib/avg_num_step_conf": 5.3984375,
"calib/ece": 0.16884920634920625,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.42857142857142855,
"calib/gap": 0.45039393939393946,
"calib/mean_conf": 0.5525793650793651,
"calib/mu_c": 0.7885000000000001,
"calib/mu_w": 0.33810606060606063,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.12261904761904756,
"calib/std_conf": 0.4203844773306327,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6118739352640544,
"calib/step_q_c_n": 587.0,
"calib/step_q_gap": 0.17602487866028083,
"calib/step_q_w": 0.4358490566037736,
"calib/step_q_w_n": 795.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2705.0,
"completions/max_terminated_length": 2705.0,
"completions/mean_length": 517.01171875,
"completions/mean_terminated_length": 517.01171875,
"completions/min_length": 123.0,
"completions/min_terminated_length": 123.0,
"epoch": 0.11093333333333333,
"grad_norm": 0.04363548383116722,
"kl": 0.0774993896484375,
"learning_rate": 2.666666666666667e-06,
"loss": 0.04,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03508147969841957,
"mask/share_reasoning": 0.8457739949226379,
"mask/share_step_conf": 0.119144506752491,
"num_tokens": 24330715.0,
"reward": 0.941875696182251,
"reward_std": 0.17754782736301422,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.7803089618682861,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8128174543380737,
"step": 104
},
{
"adv/mean_abs_final_conf": 0.6836636662483215,
"adv/mean_abs_reasoning": 0.5900191068649292,
"adv/mean_abs_step_conf": 0.7682121992111206,
"adv/ratio_final_to_reasoning": 1.1587144522843902,
"adv/ratio_step_to_reasoning": 1.3020124098913028,
"adv/std_final_conf": 0.889095664024353,
"adv/std_reasoning": 0.8100023865699768,
"adv/std_step_conf": 0.9345039129257202,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.8001728953318261,
"calib/avg_num_step_conf": 5.3828125,
"calib/ece": 0.15961686746987958,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5140562248995983,
"calib/gap": 0.4813849049075675,
"calib/mean_conf": 0.6007044176706827,
"calib/mu_c": 0.7998315068493151,
"calib/mu_w": 0.3184466019417476,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.08698795180722894,
"calib/std_conf": 0.42927057589576506,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5579889807162535,
"calib/step_q_c_n": 726.0,
"calib/step_q_gap": 0.13505953286349276,
"calib/step_q_w": 0.4229294478527607,
"calib/step_q_w_n": 652.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1921.0,
"completions/max_terminated_length": 1921.0,
"completions/mean_length": 476.15234375,
"completions/mean_terminated_length": 481.7984313964844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.112,
"grad_norm": 0.0338783822953701,
"kl": 0.072052001953125,
"learning_rate": 2.6388888888888893e-06,
"loss": -0.1031,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03626061603426933,
"mask/share_reasoning": 0.8372711539268494,
"mask/share_step_conf": 0.11474946141242981,
"num_tokens": 24558370.0,
"reward": 0.9632641673088074,
"reward_std": 0.21223922073841095,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7844381332397461,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8334963917732239,
"step": 105
},
{
"adv/mean_abs_final_conf": 0.6210803389549255,
"adv/mean_abs_reasoning": 0.4079238772392273,
"adv/mean_abs_step_conf": 0.7573671340942383,
"adv/ratio_final_to_reasoning": 1.5225398011960267,
"adv/ratio_step_to_reasoning": 1.856638398369801,
"adv/std_final_conf": 0.8601810932159424,
"adv/std_reasoning": 0.7012488842010498,
"adv/std_step_conf": 0.9338719248771667,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7499840835296364,
"calib/avg_num_step_conf": 5.27734375,
"calib/ece": 0.20079365079365077,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.48412698412698413,
"calib/gap": 0.3779970713694532,
"calib/mean_conf": 0.5992063492063492,
"calib/mu_c": 0.7687050359712231,
"calib/mu_w": 0.3907079646017699,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1242063492063492,
"calib/std_conf": 0.41771791798689534,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5683842794759826,
"calib/step_q_c_n": 687.0,
"calib/step_q_gap": 0.14180295417477778,
"calib/step_q_w": 0.4265813253012048,
"calib/step_q_w_n": 664.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2447.0,
"completions/max_terminated_length": 2447.0,
"completions/mean_length": 475.50390625,
"completions/mean_terminated_length": 477.36865234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.11306666666666666,
"grad_norm": 0.037723153829574585,
"kl": 0.123046875,
"learning_rate": 2.6111111111111113e-06,
"loss": -0.0421,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03565295785665512,
"mask/share_reasoning": 0.8457275629043579,
"mask/share_step_conf": 0.11471326649188995,
"num_tokens": 24784683.0,
"reward": 0.9509889483451843,
"reward_std": 0.17380815744400024,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.7509718537330627,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8455371856689453,
"step": 106
},
{
"adv/mean_abs_final_conf": 0.6779407262802124,
"adv/mean_abs_reasoning": 0.493292897939682,
"adv/mean_abs_step_conf": 0.7233132123947144,
"adv/ratio_final_to_reasoning": 1.3743168188955124,
"adv/ratio_step_to_reasoning": 1.4662956134494325,
"adv/std_final_conf": 0.8676726818084717,
"adv/std_reasoning": 0.7575341463088989,
"adv/std_step_conf": 0.9344541430473328,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6941146102219928,
"calib/avg_num_step_conf": 5.72265625,
"calib/ece": 0.24161067193675892,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.549407114624506,
"calib/gap": 0.29727607124419203,
"calib/mean_conf": 0.6545948616600791,
"calib/mu_c": 0.7767953020134228,
"calib/mu_w": 0.4795192307692308,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15363636363636368,
"calib/std_conf": 0.3998723349794255,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5191130012150669,
"calib/step_q_c_n": 823.0,
"calib/step_q_gap": 0.11733730028048733,
"calib/step_q_w": 0.40177570093457954,
"calib/step_q_w_n": 642.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2696.0,
"completions/max_terminated_length": 2696.0,
"completions/mean_length": 468.5078125,
"completions/mean_terminated_length": 468.5078125,
"completions/min_length": 136.0,
"completions/min_terminated_length": 136.0,
"epoch": 0.11413333333333334,
"grad_norm": 0.0470091886818409,
"kl": 0.07814788818359375,
"learning_rate": 2.5833333333333337e-06,
"loss": 0.0497,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03784269094467163,
"mask/share_reasoning": 0.8325836658477783,
"mask/share_step_conf": 0.12957364320755005,
"num_tokens": 25009237.0,
"reward": 0.9420108795166016,
"reward_std": 0.18782854080200195,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7289911508560181,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8409680128097534,
"step": 107
},
{
"adv/mean_abs_final_conf": 0.5114879608154297,
"adv/mean_abs_reasoning": 0.3971118927001953,
"adv/mean_abs_step_conf": 0.7521694898605347,
"adv/ratio_final_to_reasoning": 1.2880197501452924,
"adv/ratio_step_to_reasoning": 1.8940996320863013,
"adv/std_final_conf": 0.7579714059829712,
"adv/std_reasoning": 0.6816282868385315,
"adv/std_step_conf": 0.9336187243461609,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6744449846054124,
"calib/avg_num_step_conf": 5.8125,
"calib/ece": 0.22449275362318838,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.691699604743083,
"calib/gap": 0.30871657754010695,
"calib/mean_conf": 0.7433333333333333,
"calib/mu_c": 0.8238680926916221,
"calib/mu_w": 0.5151515151515151,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.11434782608695654,
"calib/std_conf": 0.3995920185528892,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5359702970297029,
"calib/step_q_c_n": 1010.0,
"calib/step_q_gap": 0.14574017150669027,
"calib/step_q_w": 0.3902301255230126,
"calib/step_q_w_n": 478.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2290.0,
"completions/max_terminated_length": 2290.0,
"completions/mean_length": 521.9921875,
"completions/mean_terminated_length": 521.9921875,
"completions/min_length": 149.0,
"completions/min_terminated_length": 149.0,
"epoch": 0.1152,
"grad_norm": 0.031734488904476166,
"kl": 0.0790863037109375,
"learning_rate": 2.5555555555555557e-06,
"loss": -0.0242,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03476386517286301,
"mask/share_reasoning": 0.8393080234527588,
"mask/share_step_conf": 0.12592805922031403,
"num_tokens": 25246099.0,
"reward": 0.9783648252487183,
"reward_std": 0.16985079646110535,
"rewards/accuracy_reward_step": 0.73046875,
"rewards/final_brier_reward_step": 0.7575603723526001,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8554192185401917,
"step": 108
},
{
"adv/mean_abs_final_conf": 0.6080666780471802,
"adv/mean_abs_reasoning": 0.44817543029785156,
"adv/mean_abs_step_conf": 0.7608993053436279,
"adv/ratio_final_to_reasoning": 1.356760404386887,
"adv/ratio_step_to_reasoning": 1.6977711268954305,
"adv/std_final_conf": 0.8206971883773804,
"adv/std_reasoning": 0.7392958402633667,
"adv/std_step_conf": 0.9336312413215637,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.8191,
"calib/avg_num_step_conf": 6.29296875,
"calib/ece": 0.16285714285714276,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.4489795918367347,
"calib/gap": 0.5080299999999999,
"calib/mean_conf": 0.5344489795918367,
"calib/mu_c": 0.7832799999999999,
"calib/mu_w": 0.27525,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.09355102040816317,
"calib/std_conf": 0.4471490214647886,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5017307152875176,
"calib/step_q_c_n": 713.0,
"calib/step_q_gap": 0.199670581657228,
"calib/step_q_w": 0.30206013363028955,
"calib/step_q_w_n": 898.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 3022.0,
"completions/max_terminated_length": 3022.0,
"completions/mean_length": 556.76953125,
"completions/mean_terminated_length": 563.37158203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.11626666666666667,
"grad_norm": 0.060035791248083115,
"kl": 0.07425689697265625,
"learning_rate": 2.5277777777777778e-06,
"loss": -0.0473,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.031296491622924805,
"mask/share_reasoning": 0.8337397575378418,
"mask/share_step_conf": 0.12324501574039459,
"num_tokens": 25493232.0,
"reward": 0.9433398842811584,
"reward_std": 0.19780653715133667,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.7644991874694824,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.8346804976463318,
"step": 109
},
{
"adv/mean_abs_final_conf": 0.6949703693389893,
"adv/mean_abs_reasoning": 0.5166112184524536,
"adv/mean_abs_step_conf": 0.7524176836013794,
"adv/ratio_final_to_reasoning": 1.345248311526845,
"adv/ratio_step_to_reasoning": 1.4564485956292261,
"adv/std_final_conf": 0.8908286094665527,
"adv/std_reasoning": 0.7576285004615784,
"adv/std_step_conf": 0.9336581230163574,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6592592592592593,
"calib/avg_num_step_conf": 5.07421875,
"calib/ece": 0.3303187250996016,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.46215139442231074,
"calib/gap": 0.22483588761174977,
"calib/mean_conf": 0.5544621513944223,
"calib/mu_c": 0.6583703703703704,
"calib/mu_w": 0.4335344827586206,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1734661354581673,
"calib/std_conf": 0.4403701919272022,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.49868113522537566,
"calib/step_q_c_n": 599.0,
"calib/step_q_gap": 0.1533811352253756,
"calib/step_q_w": 0.34530000000000005,
"calib/step_q_w_n": 700.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2472.0,
"completions/max_terminated_length": 2472.0,
"completions/mean_length": 490.71875,
"completions/mean_terminated_length": 490.71875,
"completions/min_length": 113.0,
"completions/min_terminated_length": 113.0,
"epoch": 0.11733333333333333,
"grad_norm": 0.031973760575056076,
"kl": 0.079925537109375,
"learning_rate": 2.5e-06,
"loss": 0.045,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.036116331815719604,
"mask/share_reasoning": 0.8464046716690063,
"mask/share_step_conf": 0.11747899651527405,
"num_tokens": 25723776.0,
"reward": 0.89471036195755,
"reward_std": 0.18601158261299133,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.655937910079956,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.831920325756073,
"step": 110
},
{
"adv/mean_abs_final_conf": 0.5667400360107422,
"adv/mean_abs_reasoning": 0.47426918148994446,
"adv/mean_abs_step_conf": 0.7553653717041016,
"adv/ratio_final_to_reasoning": 1.194975465684477,
"adv/ratio_step_to_reasoning": 1.5926933504957606,
"adv/std_final_conf": 0.7973593473434448,
"adv/std_reasoning": 0.7393887639045715,
"adv/std_step_conf": 0.9348477721214294,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7744879201680672,
"calib/avg_num_step_conf": 5.15234375,
"calib/ece": 0.21139784946236562,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5080645161290323,
"calib/gap": 0.45991946778711484,
"calib/mean_conf": 0.5748924731182795,
"calib/mu_c": 0.7825980392156863,
"calib/mu_w": 0.3226785714285714,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.11895161290322583,
"calib/std_conf": 0.45134197262340864,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5010180623973728,
"calib/step_q_c_n": 609.0,
"calib/step_q_gap": 0.18886313281990802,
"calib/step_q_w": 0.31215492957746477,
"calib/step_q_w_n": 710.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2971.0,
"completions/max_terminated_length": 2971.0,
"completions/mean_length": 515.125,
"completions/mean_terminated_length": 519.1810913085938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 106.0,
"epoch": 0.1184,
"grad_norm": 0.030626846477389336,
"kl": 0.07135009765625,
"learning_rate": 2.4722222222222226e-06,
"loss": 0.041,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.037555031478405,
"mask/share_reasoning": 0.8396614789962769,
"mask/share_step_conf": 0.11497093737125397,
"num_tokens": 25963056.0,
"reward": 0.9370782375335693,
"reward_std": 0.18119025230407715,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.7514935731887817,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8226628303527832,
"step": 111
},
{
"adv/mean_abs_final_conf": 0.6594914793968201,
"adv/mean_abs_reasoning": 0.5638391375541687,
"adv/mean_abs_step_conf": 0.7696518898010254,
"adv/ratio_final_to_reasoning": 1.1696447363650098,
"adv/ratio_step_to_reasoning": 1.3650203374310532,
"adv/std_final_conf": 0.8604232668876648,
"adv/std_reasoning": 0.8266275525093079,
"adv/std_step_conf": 0.9347512722015381,
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.7883694733877782,
"calib/avg_num_step_conf": 5.23046875,
"calib/ece": 0.23908333333333331,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.3333333333333333,
"calib/gap": 0.4711236271472825,
"calib/mean_conf": 0.3926666666666667,
"calib/mu_c": 0.6007462686567164,
"calib/mu_w": 0.12962264150943395,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.03670833333333334,
"calib/std_conf": 0.44674700769998327,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.43881305637982193,
"calib/step_q_c_n": 674.0,
"calib/step_q_gap": 0.18350478570313017,
"calib/step_q_w": 0.25530827067669176,
"calib/step_q_w_n": 665.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2900.0,
"completions/max_terminated_length": 2900.0,
"completions/mean_length": 571.58203125,
"completions/mean_terminated_length": 585.300048828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 218.0,
"epoch": 0.11946666666666667,
"grad_norm": 0.03158825263381004,
"kl": 0.06920623779296875,
"learning_rate": 2.4444444444444447e-06,
"loss": -0.0995,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.028736168518662453,
"mask/share_reasoning": 0.8447258472442627,
"mask/share_step_conf": 0.1031005010008812,
"num_tokens": 26217301.0,
"reward": 0.8999679088592529,
"reward_std": 0.23615305125713348,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.7113093733787537,
"rewards/format_reward_step": 0.9375,
"rewards/step_l2_reward": 0.7964389324188232,
"step": 112
},
{
"adv/mean_abs_final_conf": 0.6749004125595093,
"adv/mean_abs_reasoning": 0.47871798276901245,
"adv/mean_abs_step_conf": 0.7707317471504211,
"adv/ratio_final_to_reasoning": 1.409807938811351,
"adv/ratio_step_to_reasoning": 1.6099912158977931,
"adv/std_final_conf": 0.8600090742111206,
"adv/std_reasoning": 0.7206716537475586,
"adv/std_step_conf": 0.9334895014762878,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7614010989010989,
"calib/avg_num_step_conf": 6.1484375,
"calib/ece": 0.23681274900398408,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.41832669322709165,
"calib/gap": 0.4297637362637362,
"calib/mean_conf": 0.5176892430278884,
"calib/mu_c": 0.6735,
"calib/mu_w": 0.24373626373626378,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05852589641434269,
"calib/std_conf": 0.44088764033280836,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42194539249146756,
"calib/step_q_c_n": 879.0,
"calib/step_q_gap": 0.18594539249146755,
"calib/step_q_w": 0.23600000000000002,
"calib/step_q_w_n": 695.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2987.0,
"completions/max_terminated_length": 2987.0,
"completions/mean_length": 482.6875,
"completions/mean_terminated_length": 486.4881896972656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.12053333333333334,
"grad_norm": 0.04572642967104912,
"kl": 0.08179473876953125,
"learning_rate": 2.4166666666666667e-06,
"loss": -0.0577,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.035985179245471954,
"mask/share_reasoning": 0.8264555931091309,
"mask/share_step_conf": 0.129746675491333,
"num_tokens": 26446069.0,
"reward": 0.9624725580215454,
"reward_std": 0.17017316818237305,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7439906597137451,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8598606586456299,
"step": 113
},
{
"adv/mean_abs_final_conf": 0.6364820599555969,
"adv/mean_abs_reasoning": 0.38433846831321716,
"adv/mean_abs_step_conf": 0.7515975832939148,
"adv/ratio_final_to_reasoning": 1.6560456796036742,
"adv/ratio_step_to_reasoning": 1.9555616865325052,
"adv/std_final_conf": 0.8455886840820312,
"adv/std_reasoning": 0.6612743735313416,
"adv/std_step_conf": 0.9331404566764832,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.8306848404255318,
"calib/avg_num_step_conf": 5.7890625,
"calib/ece": 0.1618503937007874,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.4881889763779528,
"calib/gap": 0.528344414893617,
"calib/mean_conf": 0.5772834645669291,
"calib/mu_c": 0.7728125,
"calib/mu_w": 0.244468085106383,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.054606299212598436,
"calib/std_conf": 0.43322824806726284,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.46794717887154863,
"calib/step_q_c_n": 833.0,
"calib/step_q_gap": 0.1828470247883437,
"calib/step_q_w": 0.28510015408320494,
"calib/step_q_w_n": 649.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2288.0,
"completions/max_terminated_length": 2288.0,
"completions/mean_length": 510.55078125,
"completions/mean_terminated_length": 512.552978515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 121.0,
"epoch": 0.1216,
"grad_norm": 0.04408552497625351,
"kl": 0.0717620849609375,
"learning_rate": 2.388888888888889e-06,
"loss": 0.0212,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03622880205512047,
"mask/share_reasoning": 0.8335362672805786,
"mask/share_step_conf": 0.12632864713668823,
"num_tokens": 26681794.0,
"reward": 1.0109832286834717,
"reward_std": 0.14032113552093506,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.816330075263977,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.882198691368103,
"step": 114
},
{
"adv/mean_abs_final_conf": 0.7068969011306763,
"adv/mean_abs_reasoning": 0.4891188144683838,
"adv/mean_abs_step_conf": 0.7457759380340576,
"adv/ratio_final_to_reasoning": 1.445245777141066,
"adv/ratio_step_to_reasoning": 1.5247336965449403,
"adv/std_final_conf": 0.8840060830116272,
"adv/std_reasoning": 0.7392686009407043,
"adv/std_step_conf": 0.934516966342926,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6875624687656172,
"calib/avg_num_step_conf": 5.09375,
"calib/ece": 0.29598425196850386,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.43700787401574803,
"calib/gap": 0.2785732133933033,
"calib/mean_conf": 0.5239370078740158,
"calib/mu_c": 0.6511594202898551,
"calib/mu_w": 0.37258620689655175,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13830708661417318,
"calib/std_conf": 0.44368208359993827,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42582047685834507,
"calib/step_q_c_n": 713.0,
"calib/step_q_gap": 0.08035516382958025,
"calib/step_q_w": 0.3454653130287648,
"calib/step_q_w_n": 591.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1066.0,
"completions/max_terminated_length": 1066.0,
"completions/mean_length": 441.0546875,
"completions/mean_terminated_length": 442.7843322753906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 125.0,
"epoch": 0.12266666666666666,
"grad_norm": 0.05775079503655434,
"kl": 0.08318328857421875,
"learning_rate": 2.361111111111111e-06,
"loss": 0.008,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03622948005795479,
"mask/share_reasoning": 0.8380911350250244,
"mask/share_step_conf": 0.12177319079637527,
"num_tokens": 26899968.0,
"reward": 0.9117941856384277,
"reward_std": 0.18091896176338196,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6874749660491943,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.828300952911377,
"step": 115
},
{
"adv/mean_abs_final_conf": 0.6373252868652344,
"adv/mean_abs_reasoning": 0.3526563048362732,
"adv/mean_abs_step_conf": 0.7363879680633545,
"adv/ratio_final_to_reasoning": 1.807213647182981,
"adv/ratio_step_to_reasoning": 2.0881179719875855,
"adv/std_final_conf": 0.8416139483451843,
"adv/std_reasoning": 0.6403775811195374,
"adv/std_step_conf": 0.9332254528999329,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7618092105263158,
"calib/avg_num_step_conf": 5.59765625,
"calib/ece": 0.22785714285714284,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.46825396825396826,
"calib/gap": 0.3854131578947369,
"calib/mean_conf": 0.5585714285714285,
"calib/mu_c": 0.7115131578947369,
"calib/mu_w": 0.3261,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.09162698412698413,
"calib/std_conf": 0.44005449569760496,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4407152317880795,
"calib/step_q_c_n": 755.0,
"calib/step_q_gap": 0.1707742288382269,
"calib/step_q_w": 0.26994100294985257,
"calib/step_q_w_n": 678.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2713.0,
"completions/max_terminated_length": 2713.0,
"completions/mean_length": 513.18359375,
"completions/mean_terminated_length": 517.2244262695312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 86.0,
"epoch": 0.12373333333333333,
"grad_norm": 0.049823228269815445,
"kl": 0.07218170166015625,
"learning_rate": 2.3333333333333336e-06,
"loss": -0.0028,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03470658138394356,
"mask/share_reasoning": 0.8393930792808533,
"mask/share_step_conf": 0.11808786541223526,
"num_tokens": 27135863.0,
"reward": 0.9575741291046143,
"reward_std": 0.1541958898305893,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7338913679122925,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8664129972457886,
"step": 116
},
{
"adv/mean_abs_final_conf": 0.7075223326683044,
"adv/mean_abs_reasoning": 0.48216667771339417,
"adv/mean_abs_step_conf": 0.7466195821762085,
"adv/ratio_final_to_reasoning": 1.467381230124875,
"adv/ratio_step_to_reasoning": 1.548467815563166,
"adv/std_final_conf": 0.8886599540710449,
"adv/std_reasoning": 0.7393694519996643,
"adv/std_step_conf": 0.9349843263626099,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6939594749585829,
"calib/avg_num_step_conf": 5.55078125,
"calib/ece": 0.2669322709163347,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.46613545816733065,
"calib/gap": 0.281900089206066,
"calib/mean_conf": 0.5914741035856573,
"calib/mu_c": 0.7408474576271186,
"calib/mu_w": 0.45894736842105255,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.1941434262948208,
"calib/std_conf": 0.41703351247183346,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.44488524590163936,
"calib/step_q_c_n": 610.0,
"calib/step_q_gap": 0.10553876008166396,
"calib/step_q_w": 0.3393464858199754,
"calib/step_q_w_n": 811.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2839.0,
"completions/max_terminated_length": 2839.0,
"completions/mean_length": 511.55078125,
"completions/mean_terminated_length": 513.556884765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.1248,
"grad_norm": 0.043811291456222534,
"kl": 0.07315826416015625,
"learning_rate": 2.305555555555556e-06,
"loss": -0.0016,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03278231620788574,
"mask/share_reasoning": 0.8445600271224976,
"mask/share_step_conf": 0.1187513917684555,
"num_tokens": 27373420.0,
"reward": 0.9010759592056274,
"reward_std": 0.21563945710659027,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.681955873966217,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8334771990776062,
"step": 117
},
{
"adv/mean_abs_final_conf": 0.6128525733947754,
"adv/mean_abs_reasoning": 0.3557014465332031,
"adv/mean_abs_step_conf": 0.7667481899261475,
"adv/ratio_final_to_reasoning": 1.7229409083596976,
"adv/ratio_step_to_reasoning": 2.155594804010939,
"adv/std_final_conf": 0.8230372071266174,
"adv/std_reasoning": 0.640249490737915,
"adv/std_step_conf": 0.9342384934425354,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.755758695795657,
"calib/avg_num_step_conf": 6.54296875,
"calib/ece": 0.22661290322580652,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5887096774193549,
"calib/gap": 0.33595274239324147,
"calib/mean_conf": 0.7010483870967742,
"calib/mu_c": 0.848705035971223,
"calib/mu_w": 0.5127522935779816,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.18358870967741941,
"calib/std_conf": 0.3858815181448648,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.42637698898408816,
"calib/step_q_c_n": 817.0,
"calib/step_q_gap": 0.10349820110530022,
"calib/step_q_w": 0.32287878787878793,
"calib/step_q_w_n": 858.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2961.0,
"completions/max_terminated_length": 2961.0,
"completions/mean_length": 562.41015625,
"completions/mean_terminated_length": 566.8385620117188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 119.0,
"epoch": 0.12586666666666665,
"grad_norm": 0.038998398929834366,
"kl": 0.0649261474609375,
"learning_rate": 2.277777777777778e-06,
"loss": -0.0288,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03201957419514656,
"mask/share_reasoning": 0.8347534537315369,
"mask/share_step_conf": 0.12541446089744568,
"num_tokens": 27621405.0,
"reward": 0.9234127402305603,
"reward_std": 0.18003970384597778,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.7270601391792297,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.817421555519104,
"step": 118
},
{
"adv/mean_abs_final_conf": 0.69493168592453,
"adv/mean_abs_reasoning": 0.5836117267608643,
"adv/mean_abs_step_conf": 0.7426487803459167,
"adv/ratio_final_to_reasoning": 1.190743184311098,
"adv/ratio_step_to_reasoning": 1.2725048971646489,
"adv/std_final_conf": 0.8770858645439148,
"adv/std_reasoning": 0.8099712133407593,
"adv/std_step_conf": 0.9345157146453857,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7353337374553781,
"calib/avg_num_step_conf": 5.81640625,
"calib/ece": 0.24382661290322566,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.5564516129032258,
"calib/gap": 0.3643693001953257,
"calib/mean_conf": 0.6318185483870967,
"calib/mu_c": 0.7802108843537415,
"calib/mu_w": 0.4158415841584158,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.14145161290322567,
"calib/std_conf": 0.4348907947584482,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4687263556116015,
"calib/step_q_c_n": 793.0,
"calib/step_q_gap": 0.161686125726544,
"calib/step_q_w": 0.30704022988505747,
"calib/step_q_w_n": 696.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2572.0,
"completions/max_terminated_length": 2572.0,
"completions/mean_length": 559.5546875,
"completions/mean_terminated_length": 566.1897583007812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 149.0,
"epoch": 0.12693333333333334,
"grad_norm": 0.03349678963422775,
"kl": 0.06784820556640625,
"learning_rate": 2.25e-06,
"loss": -0.0964,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03033093363046646,
"mask/share_reasoning": 0.8460862636566162,
"mask/share_step_conf": 0.11186406016349792,
"num_tokens": 27869715.0,
"reward": 0.9392236471176147,
"reward_std": 0.23281517624855042,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7167088985443115,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8539257645606995,
"step": 119
},
{
"adv/mean_abs_final_conf": 0.616726279258728,
"adv/mean_abs_reasoning": 0.47547075152397156,
"adv/mean_abs_step_conf": 0.7524898052215576,
"adv/ratio_final_to_reasoning": 1.2970856299404463,
"adv/ratio_step_to_reasoning": 1.5826205982380386,
"adv/std_final_conf": 0.8283965587615967,
"adv/std_reasoning": 0.7575518488883972,
"adv/std_step_conf": 0.9344828128814697,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.8128576474743907,
"calib/avg_num_step_conf": 5.22265625,
"calib/ece": 0.19245901639344262,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.569672131147541,
"calib/gap": 0.48753373366301667,
"calib/mean_conf": 0.6265573770491804,
"calib/mu_c": 0.8163758389261745,
"calib/mu_w": 0.3288421052631578,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.10418032786885248,
"calib/std_conf": 0.44613416068580203,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.43939470365699873,
"calib/step_q_c_n": 793.0,
"calib/step_q_gap": 0.13764838012758696,
"calib/step_q_w": 0.3017463235294118,
"calib/step_q_w_n": 544.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2755.0,
"completions/max_terminated_length": 2755.0,
"completions/mean_length": 499.078125,
"completions/mean_terminated_length": 503.00787353515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.128,
"grad_norm": 0.04359474778175354,
"kl": 0.078948974609375,
"learning_rate": 2.222222222222222e-06,
"loss": 0.0079,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.03324298560619354,
"mask/share_reasoning": 0.845354437828064,
"mask/share_step_conf": 0.11359011381864548,
"num_tokens": 28104167.0,
"reward": 0.9440826177597046,
"reward_std": 0.20472922921180725,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7542468309402466,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.827668309211731,
"step": 120
},
{
"adv/mean_abs_final_conf": 0.7143666744232178,
"adv/mean_abs_reasoning": 0.517063319683075,
"adv/mean_abs_step_conf": 0.7503951787948608,
"adv/ratio_final_to_reasoning": 1.381584512436652,
"adv/ratio_step_to_reasoning": 1.4512636078204166,
"adv/std_final_conf": 0.8769662380218506,
"adv/std_reasoning": 0.7576488256454468,
"adv/std_step_conf": 0.9345428943634033,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7345043280470083,
"calib/avg_num_step_conf": 6.125,
"calib/ece": 0.23984126984126986,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5555555555555556,
"calib/gap": 0.3350906678460858,
"calib/mean_conf": 0.6763492063492064,
"calib/mu_c": 0.8345864661654135,
"calib/mu_w": 0.4994957983193277,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19420634920634922,
"calib/std_conf": 0.4057241286035576,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4496797671033479,
"calib/step_q_c_n": 687.0,
"calib/step_q_gap": 0.12050837096259875,
"calib/step_q_w": 0.32917139614074914,
"calib/step_q_w_n": 881.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2652.0,
"completions/max_terminated_length": 2652.0,
"completions/mean_length": 560.47265625,
"completions/mean_terminated_length": 564.8858032226562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.12906666666666666,
"grad_norm": 0.0363897942006588,
"kl": 0.067901611328125,
"learning_rate": 2.1944444444444445e-06,
"loss": 0.0397,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.031554028391838074,
"mask/share_reasoning": 0.8472678661346436,
"mask/share_step_conf": 0.11336560547351837,
"num_tokens": 28352704.0,
"reward": 0.933656632900238,
"reward_std": 0.20358788967132568,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.7196906208992004,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8468413352966309,
"step": 121
},
{
"adv/mean_abs_final_conf": 0.6004906296730042,
"adv/mean_abs_reasoning": 0.376475065946579,
"adv/mean_abs_step_conf": 0.7485677599906921,
"adv/ratio_final_to_reasoning": 1.595034263858028,
"adv/ratio_step_to_reasoning": 1.9883594630856969,
"adv/std_final_conf": 0.8289437294006348,
"adv/std_reasoning": 0.6612106561660767,
"adv/std_step_conf": 0.9336329698562622,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.8526247258771928,
"calib/avg_num_step_conf": 6.37890625,
"calib/ece": 0.169233870967742,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6895161290322581,
"calib/gap": 0.5107017543859651,
"calib/mean_conf": 0.7434274193548387,
"calib/mu_c": 0.9411184210526317,
"calib/mu_w": 0.4304166666666666,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1498790322580646,
"calib/std_conf": 0.4027986103979277,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5285681818181818,
"calib/step_q_c_n": 880.0,
"calib/step_q_gap": 0.2499758843414221,
"calib/step_q_w": 0.2785922974767597,
"calib/step_q_w_n": 753.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2376.0,
"completions/max_terminated_length": 2376.0,
"completions/mean_length": 499.25,
"completions/mean_terminated_length": 509.1952209472656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.13013333333333332,
"grad_norm": 0.03530493378639221,
"kl": 0.07299041748046875,
"learning_rate": 2.166666666666667e-06,
"loss": -0.0542,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03175698220729828,
"mask/share_reasoning": 0.8294092416763306,
"mask/share_step_conf": 0.11930252611637115,
"num_tokens": 28587856.0,
"reward": 0.9849426746368408,
"reward_std": 0.18427860736846924,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.7999886274337769,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8566153049468994,
"step": 122
},
{
"adv/mean_abs_final_conf": 0.7172747850418091,
"adv/mean_abs_reasoning": 0.535598635673523,
"adv/mean_abs_step_conf": 0.7722024917602539,
"adv/ratio_final_to_reasoning": 1.3392020391161485,
"adv/ratio_step_to_reasoning": 1.4417558976586977,
"adv/std_final_conf": 0.8936556577682495,
"adv/std_reasoning": 0.7928531765937805,
"adv/std_step_conf": 0.9344001412391663,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7689959294436906,
"calib/avg_num_step_conf": 6.48828125,
"calib/ece": 0.22217213114754114,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.5860655737704918,
"calib/gap": 0.3767801899592945,
"calib/mean_conf": 0.6986475409836066,
"calib/mu_c": 0.8685074626865672,
"calib/mu_w": 0.4917272727272727,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.1858196721311477,
"calib/std_conf": 0.39190609075308214,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4737954239569313,
"calib/step_q_c_n": 743.0,
"calib/step_q_gap": 0.16402418212686593,
"calib/step_q_w": 0.3097712418300654,
"calib/step_q_w_n": 918.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2910.0,
"completions/max_terminated_length": 2910.0,
"completions/mean_length": 599.44921875,
"completions/mean_terminated_length": 606.5573120117188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.1312,
"grad_norm": 0.042939551174640656,
"kl": 0.05826568603515625,
"learning_rate": 2.138888888888889e-06,
"loss": -0.0356,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.029819615185260773,
"mask/share_reasoning": 0.8451920747756958,
"mask/share_step_conf": 0.11326956003904343,
"num_tokens": 28846603.0,
"reward": 0.9171556830406189,
"reward_std": 0.22644327580928802,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.7272871136665344,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8117117285728455,
"step": 123
},
{
"adv/mean_abs_final_conf": 0.5510287284851074,
"adv/mean_abs_reasoning": 0.43313780426979065,
"adv/mean_abs_step_conf": 0.7770897150039673,
"adv/ratio_final_to_reasoning": 1.272178791722104,
"adv/ratio_step_to_reasoning": 1.7940934902092676,
"adv/std_final_conf": 0.7954535484313965,
"adv/std_reasoning": 0.7204844951629639,
"adv/std_step_conf": 0.9341284036636353,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7866544117647059,
"calib/avg_num_step_conf": 5.9765625,
"calib/ece": 0.15532,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.692,
"calib/gap": 0.4443235294117646,
"calib/mean_conf": 0.76964,
"calib/mu_c": 0.9118235294117647,
"calib/mu_w": 0.4675000000000001,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.12248,
"calib/std_conf": 0.3741586166320375,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.498728813559322,
"calib/step_q_c_n": 944.0,
"calib/step_q_gap": 0.13524758489038002,
"calib/step_q_w": 0.363481228668942,
"calib/step_q_w_n": 586.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2706.0,
"completions/max_terminated_length": 2706.0,
"completions/mean_length": 536.6171875,
"completions/mean_terminated_length": 542.9802856445312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 136.0,
"epoch": 0.13226666666666667,
"grad_norm": 0.03145952895283699,
"kl": 0.065338134765625,
"learning_rate": 2.1111111111111114e-06,
"loss": -0.0023,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.0311250202357769,
"mask/share_reasoning": 0.8385068774223328,
"mask/share_step_conf": 0.11864937096834183,
"num_tokens": 29090793.0,
"reward": 0.9903885722160339,
"reward_std": 0.17230892181396484,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.8046581745147705,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8487750291824341,
"step": 124
},
{
"adv/mean_abs_final_conf": 0.6158657073974609,
"adv/mean_abs_reasoning": 0.4719720482826233,
"adv/mean_abs_step_conf": 0.7359480857849121,
"adv/ratio_final_to_reasoning": 1.3048775020436638,
"adv/ratio_step_to_reasoning": 1.559304387755218,
"adv/std_final_conf": 0.8484397530555725,
"adv/std_reasoning": 0.7394078373908997,
"adv/std_step_conf": 0.9346246719360352,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6562948467058056,
"calib/avg_num_step_conf": 5.60546875,
"calib/ece": 0.2996414342629482,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6533864541832669,
"calib/gap": 0.2253248532289629,
"calib/mean_conf": 0.7264940239043826,
"calib/mu_c": 0.8207534246575343,
"calib/mu_w": 0.5954285714285714,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.22223107569721112,
"calib/std_conf": 0.3989124395483772,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4958344640434193,
"calib/step_q_c_n": 737.0,
"calib/step_q_gap": 0.09643618324112702,
"calib/step_q_w": 0.3993982808022923,
"calib/step_q_w_n": 698.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2559.0,
"completions/max_terminated_length": 2559.0,
"completions/mean_length": 551.88671875,
"completions/mean_terminated_length": 556.2322998046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.13333333333333333,
"grad_norm": 0.029080787673592567,
"kl": 0.065887451171875,
"learning_rate": 2.0833333333333334e-06,
"loss": -0.0555,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03245328366756439,
"mask/share_reasoning": 0.8474924564361572,
"mask/share_step_conf": 0.1122417151927948,
"num_tokens": 29336884.0,
"reward": 0.9050019383430481,
"reward_std": 0.20497554540634155,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.672819972038269,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8270276784896851,
"step": 125
},
{
"adv/mean_abs_final_conf": 0.5700531005859375,
"adv/mean_abs_reasoning": 0.4702516198158264,
"adv/mean_abs_step_conf": 0.7739061117172241,
"adv/ratio_final_to_reasoning": 1.2122299563990833,
"adv/ratio_step_to_reasoning": 1.6457276894023751,
"adv/std_final_conf": 0.7908639311790466,
"adv/std_reasoning": 0.7206709384918213,
"adv/std_step_conf": 0.9327467083930969,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.8016865079365079,
"calib/avg_num_step_conf": 5.65234375,
"calib/ece": 0.20149797570850211,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.5870445344129555,
"calib/gap": 0.46720899470899474,
"calib/mean_conf": 0.67,
"calib/mu_c": 0.8818518518518519,
"calib/mu_w": 0.41464285714285715,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.16246963562753047,
"calib/std_conf": 0.4209455329379382,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.49954285714285723,
"calib/step_q_c_n": 700.0,
"calib/step_q_gap": 0.1847905144387073,
"calib/step_q_w": 0.3147523427041499,
"calib/step_q_w_n": 747.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2843.0,
"completions/max_terminated_length": 2843.0,
"completions/mean_length": 531.10546875,
"completions/mean_terminated_length": 539.5357666015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 68.0,
"epoch": 0.1344,
"grad_norm": 0.04010576009750366,
"kl": 0.077484130859375,
"learning_rate": 2.0555555555555555e-06,
"loss": -0.0943,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.035097844898700714,
"mask/share_reasoning": 0.8299784064292908,
"mask/share_step_conf": 0.11929875612258911,
"num_tokens": 29578311.0,
"reward": 0.9459041357040405,
"reward_std": 0.17431926727294922,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.7634941339492798,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8298766613006592,
"step": 126
},
{
"adv/mean_abs_final_conf": 0.5844104886054993,
"adv/mean_abs_reasoning": 0.42019808292388916,
"adv/mean_abs_step_conf": 0.7377088069915771,
"adv/ratio_final_to_reasoning": 1.3907976079732711,
"adv/ratio_step_to_reasoning": 1.7556215436737226,
"adv/std_final_conf": 0.8205244541168213,
"adv/std_reasoning": 0.7013723850250244,
"adv/std_step_conf": 0.9340208768844604,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7519230769230769,
"calib/avg_num_step_conf": 6.1484375,
"calib/ece": 0.2734959349593497,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.6097560975609756,
"calib/gap": 0.32465119363395234,
"calib/mean_conf": 0.6983739837398374,
"calib/mu_c": 0.8514615384615385,
"calib/mu_w": 0.5268103448275862,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.22170731707317082,
"calib/std_conf": 0.41161959307648976,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5116850828729281,
"calib/step_q_c_n": 724.0,
"calib/step_q_gap": 0.2000262593435163,
"calib/step_q_w": 0.3116588235294118,
"calib/step_q_w_n": 850.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2754.0,
"completions/max_terminated_length": 2754.0,
"completions/mean_length": 515.87890625,
"completions/mean_terminated_length": 524.0675048828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.13546666666666668,
"grad_norm": 0.031249074265360832,
"kl": 0.06769561767578125,
"learning_rate": 2.027777777777778e-06,
"loss": -0.0101,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.033468492329120636,
"mask/share_reasoning": 0.8253788948059082,
"mask/share_step_conf": 0.12552762031555176,
"num_tokens": 29814048.0,
"reward": 0.901087760925293,
"reward_std": 0.19262036681175232,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.6864038705825806,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8220216631889343,
"step": 127
},
{
"adv/mean_abs_final_conf": 0.6289718747138977,
"adv/mean_abs_reasoning": 0.5378081798553467,
"adv/mean_abs_step_conf": 0.7337651252746582,
"adv/ratio_final_to_reasoning": 1.169509684443757,
"adv/ratio_step_to_reasoning": 1.3643621513380806,
"adv/std_final_conf": 0.8287432789802551,
"adv/std_reasoning": 0.775564432144165,
"adv/std_step_conf": 0.9350315928459167,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.7391482391482391,
"calib/avg_num_step_conf": 5.26171875,
"calib/ece": 0.2504526748971194,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.5967078189300411,
"calib/gap": 0.3706777231777232,
"calib/mean_conf": 0.6675720164609054,
"calib/mu_c": 0.8368939393939394,
"calib/mu_w": 0.46621621621621623,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.1874074074074075,
"calib/std_conf": 0.43235248674798726,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.5065650080256823,
"calib/step_q_c_n": 623.0,
"calib/step_q_gap": 0.19077771520800274,
"calib/step_q_w": 0.31578729281767953,
"calib/step_q_w_n": 724.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2343.0,
"completions/max_terminated_length": 2343.0,
"completions/mean_length": 509.5859375,
"completions/mean_terminated_length": 521.8160400390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 83.0,
"epoch": 0.13653333333333334,
"grad_norm": 0.05047476291656494,
"kl": 0.07059478759765625,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.1925,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.03439555689692497,
"mask/share_reasoning": 0.8320193290710449,
"mask/share_step_conf": 0.11014766991138458,
"num_tokens": 30051166.0,
"reward": 0.8882846832275391,
"reward_std": 0.2592315077781677,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.6846984624862671,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l2_reward": 0.8020271062850952,
"step": 128
},
{
"adv/mean_abs_final_conf": 0.5699493885040283,
"adv/mean_abs_reasoning": 0.3200322091579437,
"adv/mean_abs_step_conf": 0.7701667547225952,
"adv/ratio_final_to_reasoning": 1.7809125837791668,
"adv/ratio_step_to_reasoning": 2.406528882667866,
"adv/std_final_conf": 0.8245267868041992,
"adv/std_reasoning": 0.6400149464607239,
"adv/std_step_conf": 0.9331631064414978,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7107128113389295,
"calib/avg_num_step_conf": 5.328125,
"calib/ece": 0.22188235294117642,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.7372549019607844,
"calib/gap": 0.277128113389294,
"calib/mean_conf": 0.8085490196078432,
"calib/mu_c": 0.9020118343195266,
"calib/mu_w": 0.6248837209302326,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18384313725490192,
"calib/std_conf": 0.3402211687645397,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5089479512735327,
"calib/step_q_c_n": 903.0,
"calib/step_q_gap": 0.07988070615422677,
"calib/step_q_w": 0.4290672451193059,
"calib/step_q_w_n": 461.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1474.0,
"completions/max_terminated_length": 1474.0,
"completions/mean_length": 449.76953125,
"completions/mean_terminated_length": 451.5333557128906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 177.0,
"epoch": 0.1376,
"grad_norm": 0.03063477762043476,
"kl": 0.07349395751953125,
"learning_rate": 1.9722222222222224e-06,
"loss": -0.0181,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03671756386756897,
"mask/share_reasoning": 0.8324686288833618,
"mask/share_step_conf": 0.12690752744674683,
"num_tokens": 30268691.0,
"reward": 0.9700326919555664,
"reward_std": 0.16116727888584137,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.760378897190094,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8484363555908203,
"step": 129
},
{
"adv/mean_abs_final_conf": 0.5980579853057861,
"adv/mean_abs_reasoning": 0.27735209465026855,
"adv/mean_abs_step_conf": 0.7554200887680054,
"adv/ratio_final_to_reasoning": 2.156313209243711,
"adv/ratio_step_to_reasoning": 2.72368625778927,
"adv/std_final_conf": 0.8363456130027771,
"adv/std_reasoning": 0.5726004242897034,
"adv/std_step_conf": 0.9338682293891907,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7849537685626227,
"calib/avg_num_step_conf": 4.9140625,
"calib/ece": 0.16988095238095235,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.7023809523809523,
"calib/gap": 0.480421686746988,
"calib/mean_conf": 0.741468253968254,
"calib/mu_c": 0.905421686746988,
"calib/mu_w": 0.425,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.12630952380952382,
"calib/std_conf": 0.4020747617938168,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5338902743142144,
"calib/step_q_c_n": 802.0,
"calib/step_q_gap": 0.09031571291070573,
"calib/step_q_w": 0.4435745614035087,
"calib/step_q_w_n": 456.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1697.0,
"completions/max_terminated_length": 1697.0,
"completions/mean_length": 450.4921875,
"completions/mean_terminated_length": 457.64288330078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.13866666666666666,
"grad_norm": 0.06043770909309387,
"kl": 0.0726165771484375,
"learning_rate": 1.944444444444445e-06,
"loss": -0.0455,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.037092648446559906,
"mask/share_reasoning": 0.8293949365615845,
"mask/share_step_conf": 0.11788740009069443,
"num_tokens": 30489305.0,
"reward": 0.9999578595161438,
"reward_std": 0.1384398490190506,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.8098331689834595,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8635199069976807,
"step": 130
},
{
"adv/mean_abs_final_conf": 0.644719123840332,
"adv/mean_abs_reasoning": 0.30147671699523926,
"adv/mean_abs_step_conf": 0.7534003257751465,
"adv/ratio_final_to_reasoning": 2.1385370328631814,
"adv/ratio_step_to_reasoning": 2.499033203240845,
"adv/std_final_conf": 0.8671888113021851,
"adv/std_reasoning": 0.5959498882293701,
"adv/std_step_conf": 0.9341416954994202,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.8221227621483376,
"calib/avg_num_step_conf": 5.1953125,
"calib/ece": 0.21262948207171306,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.49800796812749004,
"calib/gap": 0.4851259590792839,
"calib/mean_conf": 0.5955776892430279,
"calib/mu_c": 0.8584347826086957,
"calib/mu_w": 0.37330882352941175,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.17501992031872504,
"calib/std_conf": 0.4372186231651954,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.553898916967509,
"calib/step_q_c_n": 554.0,
"calib/step_q_gap": 0.2200973705757565,
"calib/step_q_w": 0.3338015463917525,
"calib/step_q_w_n": 776.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2524.0,
"completions/max_terminated_length": 2524.0,
"completions/mean_length": 483.109375,
"completions/mean_terminated_length": 488.83795166015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.13973333333333332,
"grad_norm": 0.03974044695496559,
"kl": 0.07405853271484375,
"learning_rate": 1.916666666666667e-06,
"loss": -0.1017,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.034373268485069275,
"mask/share_reasoning": 0.8387865424156189,
"mask/share_step_conf": 0.11512146890163422,
"num_tokens": 30719189.0,
"reward": 0.9495083093643188,
"reward_std": 0.16720762848854065,
"rewards/accuracy_reward_step": 0.44921875,
"rewards/final_brier_reward_step": 0.7672886848449707,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8457905054092407,
"step": 131
},
{
"adv/mean_abs_final_conf": 0.6128841638565063,
"adv/mean_abs_reasoning": 0.540101945400238,
"adv/mean_abs_step_conf": 0.7529127597808838,
"adv/ratio_final_to_reasoning": 1.1347564456601498,
"adv/ratio_step_to_reasoning": 1.3940197145984061,
"adv/std_final_conf": 0.8160517811775208,
"adv/std_reasoning": 0.7577255964279175,
"adv/std_step_conf": 0.9345849752426147,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7750711478520125,
"calib/avg_num_step_conf": 5.6796875,
"calib/ece": 0.21195219123505982,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6414342629482072,
"calib/gap": 0.38740547499661204,
"calib/mean_conf": 0.7143426294820717,
"calib/mu_c": 0.8594267515923567,
"calib/mu_w": 0.47202127659574467,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.15039840637450202,
"calib/std_conf": 0.40855574277647305,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.546679292929293,
"calib/step_q_c_n": 792.0,
"calib/step_q_gap": 0.17696630199273705,
"calib/step_q_w": 0.3697129909365559,
"calib/step_q_w_n": 662.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2861.0,
"completions/max_terminated_length": 2861.0,
"completions/mean_length": 516.65625,
"completions/mean_terminated_length": 520.7244262695312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.1408,
"grad_norm": 0.030295290052890778,
"kl": 0.066131591796875,
"learning_rate": 1.888888888888889e-06,
"loss": 0.0593,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03516693413257599,
"mask/share_reasoning": 0.8338392972946167,
"mask/share_step_conf": 0.12318122386932373,
"num_tokens": 30957045.0,
"reward": 0.9622828364372253,
"reward_std": 0.20570440590381622,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.7573515772819519,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8476827144622803,
"step": 132
},
{
"adv/mean_abs_final_conf": 0.6930422782897949,
"adv/mean_abs_reasoning": 0.6090747117996216,
"adv/mean_abs_step_conf": 0.7614809274673462,
"adv/ratio_final_to_reasoning": 1.1378608647896018,
"adv/ratio_step_to_reasoning": 1.250225814198414,
"adv/std_final_conf": 0.8737168312072754,
"adv/std_reasoning": 0.826562762260437,
"adv/std_step_conf": 0.9344897866249084,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.775923016496465,
"calib/avg_num_step_conf": 6.046875,
"calib/ece": 0.25278225806451615,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5362903225806451,
"calib/gap": 0.3981657501963865,
"calib/mean_conf": 0.6040725806451613,
"calib/mu_c": 0.8192105263157895,
"calib/mu_w": 0.421044776119403,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19858870967741937,
"calib/std_conf": 0.4442454035811937,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5027538461538461,
"calib/step_q_c_n": 650.0,
"calib/step_q_gap": 0.16200774370395743,
"calib/step_q_w": 0.3407461024498887,
"calib/step_q_w_n": 898.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2803.0,
"completions/max_terminated_length": 2803.0,
"completions/mean_length": 598.63671875,
"completions/mean_terminated_length": 603.3504028320312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 55.0,
"epoch": 0.14186666666666667,
"grad_norm": 0.04612388089299202,
"kl": 0.0608673095703125,
"learning_rate": 1.8611111111111113e-06,
"loss": -0.0083,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.02898087352514267,
"mask/share_reasoning": 0.853644609451294,
"mask/share_step_conf": 0.10956203192472458,
"num_tokens": 31216640.0,
"reward": 0.9158010482788086,
"reward_std": 0.23849625885486603,
"rewards/accuracy_reward_step": 0.4453125,
"rewards/final_brier_reward_step": 0.7083597183227539,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8404297828674316,
"step": 133
},
{
"adv/mean_abs_final_conf": 0.7420611381530762,
"adv/mean_abs_reasoning": 0.6442513465881348,
"adv/mean_abs_step_conf": 0.7690349817276001,
"adv/ratio_final_to_reasoning": 1.1518193048146945,
"adv/ratio_step_to_reasoning": 1.1936878142363256,
"adv/std_final_conf": 0.8912635445594788,
"adv/std_reasoning": 0.8429958820343018,
"adv/std_step_conf": 0.9348194003105164,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7227853099907858,
"calib/avg_num_step_conf": 5.8671875,
"calib/ece": 0.2561044176706827,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.606425702811245,
"calib/gap": 0.3689844675529814,
"calib/mean_conf": 0.6691164658634536,
"calib/mu_c": 0.8276760563380282,
"calib/mu_w": 0.4586915887850468,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.17746987951807228,
"calib/std_conf": 0.43159555300705044,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4647212336892052,
"calib/step_q_c_n": 843.0,
"calib/step_q_gap": 0.1383175918075663,
"calib/step_q_w": 0.3264036418816389,
"calib/step_q_w_n": 659.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2848.0,
"completions/max_terminated_length": 2848.0,
"completions/mean_length": 574.48046875,
"completions/mean_terminated_length": 581.2925415039062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.14293333333333333,
"grad_norm": 0.03696267306804657,
"kl": 0.05755615234375,
"learning_rate": 1.8333333333333333e-06,
"loss": -0.1123,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.029521383345127106,
"mask/share_reasoning": 0.8535020351409912,
"mask/share_step_conf": 0.10525783896446228,
"num_tokens": 31472659.0,
"reward": 0.9212626218795776,
"reward_std": 0.26008397340774536,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.7156097292900085,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8222278356552124,
"step": 134
},
{
"adv/mean_abs_final_conf": 0.617477536201477,
"adv/mean_abs_reasoning": 0.4812614321708679,
"adv/mean_abs_step_conf": 0.7539912462234497,
"adv/ratio_final_to_reasoning": 1.2830397262796798,
"adv/ratio_step_to_reasoning": 1.5666978399294447,
"adv/std_final_conf": 0.8101888298988342,
"adv/std_reasoning": 0.7393453121185303,
"adv/std_step_conf": 0.9331625699996948,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7016129032258065,
"calib/avg_num_step_conf": 5.84765625,
"calib/ece": 0.23706827309236939,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6265060240963856,
"calib/gap": 0.34203267162944595,
"calib/mean_conf": 0.713855421686747,
"calib/mu_c": 0.8416025641025643,
"calib/mu_w": 0.49956989247311834,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.16220883534136535,
"calib/std_conf": 0.40319655881577965,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.48586248492159223,
"calib/step_q_c_n": 829.0,
"calib/step_q_gap": 0.12367685617907725,
"calib/step_q_w": 0.362185628742515,
"calib/step_q_w_n": 668.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2690.0,
"completions/max_terminated_length": 2690.0,
"completions/mean_length": 527.8671875,
"completions/mean_terminated_length": 534.1265258789062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.144,
"grad_norm": 0.04049808531999588,
"kl": 0.06581878662109375,
"learning_rate": 1.8055555555555557e-06,
"loss": -0.0241,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03396621719002724,
"mask/share_reasoning": 0.8378646969795227,
"mask/share_step_conf": 0.11645033210515976,
"num_tokens": 31713673.0,
"reward": 0.9581298828125,
"reward_std": 0.1770906001329422,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7313003540039062,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.86933434009552,
"step": 135
},
{
"adv/mean_abs_final_conf": 0.6305532455444336,
"adv/mean_abs_reasoning": 0.45713546872138977,
"adv/mean_abs_step_conf": 0.7678828835487366,
"adv/ratio_final_to_reasoning": 1.3793575180418491,
"adv/ratio_step_to_reasoning": 1.6797709565099135,
"adv/std_final_conf": 0.8364690542221069,
"adv/std_reasoning": 0.7206043004989624,
"adv/std_step_conf": 0.9332603812217712,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7863091806009354,
"calib/avg_num_step_conf": 6.26953125,
"calib/ece": 0.23220000000000002,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.512,
"calib/gap": 0.4209782817605227,
"calib/mean_conf": 0.5798,
"calib/mu_c": 0.7970247933884297,
"calib/mu_w": 0.376046511627907,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.164,
"calib/std_conf": 0.4506994120253542,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4941863905325444,
"calib/step_q_c_n": 676.0,
"calib/step_q_gap": 0.2114737963452462,
"calib/step_q_w": 0.28271259418729816,
"calib/step_q_w_n": 929.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2788.0,
"completions/max_terminated_length": 2788.0,
"completions/mean_length": 545.4140625,
"completions/mean_terminated_length": 551.8814697265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.14506666666666668,
"grad_norm": 0.04689570143818855,
"kl": 0.0675506591796875,
"learning_rate": 1.777777777777778e-06,
"loss": -0.0301,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03165838494896889,
"mask/share_reasoning": 0.8355990648269653,
"mask/share_step_conf": 0.12102382630109787,
"num_tokens": 31961787.0,
"reward": 0.9410368204116821,
"reward_std": 0.18190747499465942,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.7306855320930481,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8615442514419556,
"step": 136
},
{
"adv/mean_abs_final_conf": 0.6028730869293213,
"adv/mean_abs_reasoning": 0.43961775302886963,
"adv/mean_abs_step_conf": 0.7608221769332886,
"adv/ratio_final_to_reasoning": 1.3713574640142676,
"adv/ratio_step_to_reasoning": 1.7306447969659804,
"adv/std_final_conf": 0.8279385566711426,
"adv/std_reasoning": 0.7205258011817932,
"adv/std_step_conf": 0.933806300163269,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.831116158338741,
"calib/avg_num_step_conf": 6.3203125,
"calib/ece": 0.1833734939759037,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5502008032128514,
"calib/gap": 0.47973588578844906,
"calib/mean_conf": 0.6558232931726908,
"calib/mu_c": 0.8773880597014926,
"calib/mu_w": 0.3976521739130435,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.15052208835341374,
"calib/std_conf": 0.4197037405347656,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.49825970548862114,
"calib/step_q_c_n": 747.0,
"calib/step_q_gap": 0.18530907403052693,
"calib/step_q_w": 0.3129506314580942,
"calib/step_q_w_n": 871.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2792.0,
"completions/max_terminated_length": 2792.0,
"completions/mean_length": 515.35546875,
"completions/mean_terminated_length": 521.4664306640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 109.0,
"epoch": 0.14613333333333334,
"grad_norm": 0.03188343718647957,
"kl": 0.07834625244140625,
"learning_rate": 1.75e-06,
"loss": -0.0948,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03322169929742813,
"mask/share_reasoning": 0.8309173583984375,
"mask/share_step_conf": 0.12414221465587616,
"num_tokens": 32200702.0,
"reward": 0.9673022031784058,
"reward_std": 0.18526697158813477,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.778056263923645,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8573293685913086,
"step": 137
},
{
"adv/mean_abs_final_conf": 0.6043561697006226,
"adv/mean_abs_reasoning": 0.4610038101673126,
"adv/mean_abs_step_conf": 0.7497239112854004,
"adv/ratio_final_to_reasoning": 1.3109569950870534,
"adv/ratio_step_to_reasoning": 1.6262857155417052,
"adv/std_final_conf": 0.8366493582725525,
"adv/std_reasoning": 0.7207646369934082,
"adv/std_step_conf": 0.9337040185928345,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7530487804878049,
"calib/avg_num_step_conf": 6.171875,
"calib/ece": 0.23643999999999998,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.628,
"calib/gap": 0.332033465683494,
"calib/mean_conf": 0.717,
"calib/mu_c": 0.8312195121951219,
"calib/mu_w": 0.4991860465116279,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.14872000000000002,
"calib/std_conf": 0.4039220221775485,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4687580993520518,
"calib/step_q_c_n": 926.0,
"calib/step_q_gap": 0.1361128394132139,
"calib/step_q_w": 0.3326452599388379,
"calib/step_q_w_n": 654.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1269.0,
"completions/max_terminated_length": 1269.0,
"completions/mean_length": 494.53125,
"completions/mean_terminated_length": 498.4252014160156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.1472,
"grad_norm": 0.04823905602097511,
"kl": 0.07939910888671875,
"learning_rate": 1.7222222222222224e-06,
"loss": -0.1478,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03357618302106857,
"mask/share_reasoning": 0.8358300924301147,
"mask/share_step_conf": 0.12278124690055847,
"num_tokens": 32431638.0,
"reward": 0.9525300860404968,
"reward_std": 0.2015206217765808,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.7395683526992798,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8420543670654297,
"step": 138
},
{
"adv/mean_abs_final_conf": 0.6356931924819946,
"adv/mean_abs_reasoning": 0.48741334676742554,
"adv/mean_abs_step_conf": 0.7561699151992798,
"adv/ratio_final_to_reasoning": 1.304217860873068,
"adv/ratio_step_to_reasoning": 1.551393535310994,
"adv/std_final_conf": 0.8453860282897949,
"adv/std_reasoning": 0.7575779557228088,
"adv/std_step_conf": 0.9334150552749634,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.8276442307692307,
"calib/avg_num_step_conf": 5.5,
"calib/ece": 0.19976095617529877,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6374501992031872,
"calib/gap": 0.44426442307692326,
"calib/mean_conf": 0.7101195219123506,
"calib/mu_c": 0.8711875000000001,
"calib/mu_w": 0.42692307692307685,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.136215139442231,
"calib/std_conf": 0.40911768056534276,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5002891566265061,
"calib/step_q_c_n": 830.0,
"calib/step_q_gap": 0.1729535164880978,
"calib/step_q_w": 0.32733564013840827,
"calib/step_q_w_n": 578.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2796.0,
"completions/max_terminated_length": 2796.0,
"completions/mean_length": 474.73046875,
"completions/mean_terminated_length": 480.3597106933594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.14826666666666666,
"grad_norm": 0.03851529210805893,
"kl": 0.06740570068359375,
"learning_rate": 1.6944444444444446e-06,
"loss": -0.063,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03441895917057991,
"mask/share_reasoning": 0.8336876034736633,
"mask/share_step_conf": 0.12017463147640228,
"num_tokens": 32656265.0,
"reward": 0.9894264936447144,
"reward_std": 0.18618866801261902,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7859241962432861,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8718349933624268,
"step": 139
},
{
"adv/mean_abs_final_conf": 0.5681322813034058,
"adv/mean_abs_reasoning": 0.41635167598724365,
"adv/mean_abs_step_conf": 0.7558364272117615,
"adv/ratio_final_to_reasoning": 1.364549043681074,
"adv/ratio_step_to_reasoning": 1.8153798118370468,
"adv/std_final_conf": 0.8017783761024475,
"adv/std_reasoning": 0.7013971209526062,
"adv/std_step_conf": 0.9335141777992249,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7918732782369146,
"calib/avg_num_step_conf": 5.4296875,
"calib/ece": 0.1861264822134387,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.691699604743083,
"calib/gap": 0.39036363636363636,
"calib/mean_conf": 0.7545849802371543,
"calib/mu_c": 0.8903636363636364,
"calib/mu_w": 0.5,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.14426877470355728,
"calib/std_conf": 0.38078596388028896,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5402469135802469,
"calib/step_q_c_n": 891.0,
"calib/step_q_gap": 0.19100843662633904,
"calib/step_q_w": 0.34923847695390786,
"calib/step_q_w_n": 499.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2622.0,
"completions/max_terminated_length": 2622.0,
"completions/mean_length": 501.6171875,
"completions/mean_terminated_length": 503.5843505859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.14933333333333335,
"grad_norm": 0.04033830761909485,
"kl": 0.06805419921875,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0355,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.0332154780626297,
"mask/share_reasoning": 0.8421204090118408,
"mask/share_step_conf": 0.1207578182220459,
"num_tokens": 32889695.0,
"reward": 0.9933497309684753,
"reward_std": 0.16882237792015076,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7854597568511963,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8746772408485413,
"step": 140
},
{
"adv/mean_abs_final_conf": 0.5361478924751282,
"adv/mean_abs_reasoning": 0.3555518388748169,
"adv/mean_abs_step_conf": 0.7517731785774231,
"adv/ratio_final_to_reasoning": 1.5079317102446368,
"adv/ratio_step_to_reasoning": 2.1143841667546774,
"adv/std_final_conf": 0.7747222781181335,
"adv/std_reasoning": 0.640269935131073,
"adv/std_step_conf": 0.9338180422782898,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.8649622022536014,
"calib/avg_num_step_conf": 5.93359375,
"calib/ece": 0.1573122529644269,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6758893280632411,
"calib/gap": 0.5345556981885607,
"calib/mean_conf": 0.7171541501976284,
"calib/mu_c": 0.8904093567251461,
"calib/mu_w": 0.35585365853658535,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.09928853754940715,
"calib/std_conf": 0.41866884376992447,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.49948293691830403,
"calib/step_q_c_n": 967.0,
"calib/step_q_gap": 0.1837401832951156,
"calib/step_q_w": 0.31574275362318843,
"calib/step_q_w_n": 552.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1824.0,
"completions/max_terminated_length": 1824.0,
"completions/mean_length": 539.8671875,
"completions/mean_terminated_length": 544.1181030273438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.1504,
"grad_norm": 0.04087727516889572,
"kl": 0.06522369384765625,
"learning_rate": 1.638888888888889e-06,
"loss": -0.0535,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03152162581682205,
"mask/share_reasoning": 0.8432751893997192,
"mask/share_step_conf": 0.11739066243171692,
"num_tokens": 33134997.0,
"reward": 1.0191978216171265,
"reward_std": 0.1478888988494873,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.8283312320709229,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8795955181121826,
"step": 141
},
{
"adv/mean_abs_final_conf": 0.5493594408035278,
"adv/mean_abs_reasoning": 0.35145801305770874,
"adv/mean_abs_step_conf": 0.7668730020523071,
"adv/ratio_final_to_reasoning": 1.563086970258732,
"adv/ratio_step_to_reasoning": 2.1819761495276766,
"adv/std_final_conf": 0.7770100831985474,
"adv/std_reasoning": 0.6403212547302246,
"adv/std_step_conf": 0.9339790344238281,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.833258137286497,
"calib/avg_num_step_conf": 6.30859375,
"calib/ece": 0.20769841269841277,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5714285714285714,
"calib/gap": 0.4770944247502418,
"calib/mean_conf": 0.6350793650793651,
"calib/mu_c": 0.8376551724137932,
"calib/mu_w": 0.3605607476635514,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1336904761904763,
"calib/std_conf": 0.44481804025835037,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5124548736462095,
"calib/step_q_c_n": 831.0,
"calib/step_q_gap": 0.19458497568702576,
"calib/step_q_w": 0.3178698979591837,
"calib/step_q_w_n": 784.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2385.0,
"completions/max_terminated_length": 2385.0,
"completions/mean_length": 548.44140625,
"completions/mean_terminated_length": 554.9447021484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.15146666666666667,
"grad_norm": 0.0574471652507782,
"kl": 0.06587982177734375,
"learning_rate": 1.6111111111111113e-06,
"loss": -0.0563,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.030428925529122353,
"mask/share_reasoning": 0.8384536504745483,
"mask/share_step_conf": 0.11939871311187744,
"num_tokens": 33380558.0,
"reward": 0.9693418741226196,
"reward_std": 0.16276447474956512,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.7750797271728516,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.853447675704956,
"step": 142
},
{
"adv/mean_abs_final_conf": 0.6312240362167358,
"adv/mean_abs_reasoning": 0.5305913686752319,
"adv/mean_abs_step_conf": 0.7272872924804688,
"adv/ratio_final_to_reasoning": 1.189661335412902,
"adv/ratio_step_to_reasoning": 1.3707107492086472,
"adv/std_final_conf": 0.8579843640327454,
"adv/std_reasoning": 0.7754026055335999,
"adv/std_step_conf": 0.9342260360717773,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.8851172404883405,
"calib/avg_num_step_conf": 5.9765625,
"calib/ece": 0.13715999999999995,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.52,
"calib/gap": 0.5812705897551838,
"calib/mean_conf": 0.6042000000000001,
"calib/mu_c": 0.866934306569343,
"calib/mu_w": 0.2856637168141593,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.09667999999999996,
"calib/std_conf": 0.43694892149998493,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.48222695035460994,
"calib/step_q_c_n": 705.0,
"calib/step_q_gap": 0.20965725338491298,
"calib/step_q_w": 0.27256969696969696,
"calib/step_q_w_n": 825.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2965.0,
"completions/max_terminated_length": 2965.0,
"completions/mean_length": 567.5703125,
"completions/mean_terminated_length": 569.796142578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.15253333333333333,
"grad_norm": 0.03865446522831917,
"kl": 0.06455230712890625,
"learning_rate": 1.5833333333333333e-06,
"loss": 0.0308,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.030564136803150177,
"mask/share_reasoning": 0.847449541091919,
"mask/share_step_conf": 0.11808009445667267,
"num_tokens": 33633192.0,
"reward": 0.9973582029342651,
"reward_std": 0.20891296863555908,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.8224399089813232,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8707141280174255,
"step": 143
},
{
"adv/mean_abs_final_conf": 0.6034372448921204,
"adv/mean_abs_reasoning": 0.48929351568222046,
"adv/mean_abs_step_conf": 0.7381302118301392,
"adv/ratio_final_to_reasoning": 1.2332827342924209,
"adv/ratio_step_to_reasoning": 1.5085632410251062,
"adv/std_final_conf": 0.8320863246917725,
"adv/std_reasoning": 0.75753253698349,
"adv/std_step_conf": 0.9336531758308411,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7241117522016398,
"calib/avg_num_step_conf": 5.62109375,
"calib/ece": 0.25031746031746027,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5634920634920635,
"calib/gap": 0.38860917096872155,
"calib/mean_conf": 0.6234126984126984,
"calib/mu_c": 0.7375280898876404,
"calib/mu_w": 0.34891891891891885,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.08369047619047619,
"calib/std_conf": 0.4471473296822033,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.47799794661190964,
"calib/step_q_c_n": 974.0,
"calib/step_q_gap": 0.16720224768717845,
"calib/step_q_w": 0.3107956989247312,
"calib/step_q_w_n": 465.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1985.0,
"completions/max_terminated_length": 1985.0,
"completions/mean_length": 508.921875,
"completions/mean_terminated_length": 510.91766357421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 71.0,
"epoch": 0.1536,
"grad_norm": 0.0469198077917099,
"kl": 0.096343994140625,
"learning_rate": 1.5555555555555558e-06,
"loss": 0.0299,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03401659429073334,
"mask/share_reasoning": 0.8417648077011108,
"mask/share_step_conf": 0.1203123927116394,
"num_tokens": 33867604.0,
"reward": 0.9624280333518982,
"reward_std": 0.16957436501979828,
"rewards/accuracy_reward_step": 0.6953125,
"rewards/final_brier_reward_step": 0.7329937219619751,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8567060828208923,
"step": 144
},
{
"adv/mean_abs_final_conf": 0.6340999007225037,
"adv/mean_abs_reasoning": 0.4605696201324463,
"adv/mean_abs_step_conf": 0.7574340105056763,
"adv/ratio_final_to_reasoning": 1.3767731804372054,
"adv/ratio_step_to_reasoning": 1.6445592097191746,
"adv/std_final_conf": 0.8457151055335999,
"adv/std_reasoning": 0.7392440438270569,
"adv/std_step_conf": 0.9339003562927246,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7269218651543794,
"calib/avg_num_step_conf": 6.28515625,
"calib/ece": 0.24047430830039523,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5573122529644269,
"calib/gap": 0.3170833333333332,
"calib/mean_conf": 0.6568379446640317,
"calib/mu_c": 0.7433152173913042,
"calib/mu_w": 0.426231884057971,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.08501976284584978,
"calib/std_conf": 0.4133133751405631,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4666238532110092,
"calib/step_q_c_n": 1090.0,
"calib/step_q_gap": 0.12361807286418836,
"calib/step_q_w": 0.34300578034682083,
"calib/step_q_w_n": 519.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1685.0,
"completions/max_terminated_length": 1685.0,
"completions/mean_length": 491.21484375,
"completions/mean_terminated_length": 495.0826721191406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.15466666666666667,
"grad_norm": 0.05354519188404083,
"kl": 0.06645965576171875,
"learning_rate": 1.527777777777778e-06,
"loss": -0.0135,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.034240782260894775,
"mask/share_reasoning": 0.8234318494796753,
"mask/share_step_conf": 0.1345149278640747,
"num_tokens": 34096059.0,
"reward": 0.9719904661178589,
"reward_std": 0.17904748022556305,
"rewards/accuracy_reward_step": 0.71875,
"rewards/final_brier_reward_step": 0.7428406476974487,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.859734058380127,
"step": 145
},
{
"adv/mean_abs_final_conf": 0.6603371500968933,
"adv/mean_abs_reasoning": 0.4713112711906433,
"adv/mean_abs_step_conf": 0.7333450317382812,
"adv/ratio_final_to_reasoning": 1.401063777721093,
"adv/ratio_step_to_reasoning": 1.5559675241495476,
"adv/std_final_conf": 0.8613228797912598,
"adv/std_reasoning": 0.7393918037414551,
"adv/std_step_conf": 0.9345128536224365,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7508116883116884,
"calib/avg_num_step_conf": 5.953125,
"calib/ece": 0.23447999999999997,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5,
"calib/gap": 0.4116038961038961,
"calib/mean_conf": 0.5883200000000001,
"calib/mu_c": 0.8188181818181818,
"calib/mu_w": 0.4072142857142857,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.19139999999999996,
"calib/std_conf": 0.43612702002971565,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4818808777429467,
"calib/step_q_c_n": 638.0,
"calib/step_q_gap": 0.15795311250592636,
"calib/step_q_w": 0.3239277652370203,
"calib/step_q_w_n": 886.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2879.0,
"completions/max_terminated_length": 2879.0,
"completions/mean_length": 535.0390625,
"completions/mean_terminated_length": 541.3834228515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.15573333333333333,
"grad_norm": 0.04511573165655136,
"kl": 0.0630950927734375,
"learning_rate": 1.5e-06,
"loss": -0.0597,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.031575046479701996,
"mask/share_reasoning": 0.8401821851730347,
"mask/share_step_conf": 0.11652399599552155,
"num_tokens": 34340245.0,
"reward": 0.9193023443222046,
"reward_std": 0.20384211838245392,
"rewards/accuracy_reward_step": 0.4296875,
"rewards/final_brier_reward_step": 0.7231277227401733,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8350081443786621,
"step": 146
},
{
"adv/mean_abs_final_conf": 0.5824941992759705,
"adv/mean_abs_reasoning": 0.3738293945789337,
"adv/mean_abs_step_conf": 0.7618895173072815,
"adv/ratio_final_to_reasoning": 1.5581819079049906,
"adv/ratio_step_to_reasoning": 2.0380674402702947,
"adv/std_final_conf": 0.8140949606895447,
"adv/std_reasoning": 0.6403229832649231,
"adv/std_step_conf": 0.9339715242385864,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7655609631147541,
"calib/avg_num_step_conf": 6.17578125,
"calib/ece": 0.27816,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.588,
"calib/gap": 0.3858299180327869,
"calib/mean_conf": 0.64016,
"calib/mu_c": 0.8377049180327869,
"calib/mu_w": 0.451875,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.21516000000000002,
"calib/std_conf": 0.4442692588959988,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5155219780219781,
"calib/step_q_c_n": 728.0,
"calib/step_q_gap": 0.17739771073006716,
"calib/step_q_w": 0.3381242672919109,
"calib/step_q_w_n": 853.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2620.0,
"completions/max_terminated_length": 2620.0,
"completions/mean_length": 548.27734375,
"completions/mean_terminated_length": 554.7786865234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.1568,
"grad_norm": 0.04079528525471687,
"kl": 0.0682525634765625,
"learning_rate": 1.4722222222222225e-06,
"loss": -0.0792,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.030725400894880295,
"mask/share_reasoning": 0.8386132121086121,
"mask/share_step_conf": 0.11894263327121735,
"num_tokens": 34584284.0,
"reward": 0.9176037311553955,
"reward_std": 0.18044179677963257,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.7054883241653442,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8390940427780151,
"step": 147
},
{
"adv/mean_abs_final_conf": 0.5602353811264038,
"adv/mean_abs_reasoning": 0.43357449769973755,
"adv/mean_abs_step_conf": 0.7547708749771118,
"adv/ratio_final_to_reasoning": 1.2921317653566942,
"adv/ratio_step_to_reasoning": 1.7408101237075335,
"adv/std_final_conf": 0.8055301308631897,
"adv/std_reasoning": 0.7204562425613403,
"adv/std_step_conf": 0.9323009848594666,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.8212683924777715,
"calib/avg_num_step_conf": 5.703125,
"calib/ece": 0.17796000000000006,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.692,
"calib/gap": 0.45802423479424037,
"calib/mean_conf": 0.73372,
"calib/mu_c": 0.8637988826815642,
"calib/mu_w": 0.40577464788732387,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.09784000000000004,
"calib/std_conf": 0.40944274520377083,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5178674948240165,
"calib/step_q_c_n": 966.0,
"calib/step_q_gap": 0.18732093611956302,
"calib/step_q_w": 0.3305465587044535,
"calib/step_q_w_n": 494.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2539.0,
"completions/max_terminated_length": 2539.0,
"completions/mean_length": 519.296875,
"completions/mean_terminated_length": 521.3333740234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 83.0,
"epoch": 0.15786666666666666,
"grad_norm": 0.056484851986169815,
"kl": 0.07324981689453125,
"learning_rate": 1.4444444444444445e-06,
"loss": -0.0459,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03550893813371658,
"mask/share_reasoning": 0.8370537161827087,
"mask/share_step_conf": 0.12353110313415527,
"num_tokens": 34822336.0,
"reward": 0.9942589998245239,
"reward_std": 0.1536797285079956,
"rewards/accuracy_reward_step": 0.69921875,
"rewards/final_brier_reward_step": 0.7958706617355347,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8574910163879395,
"step": 148
},
{
"adv/mean_abs_final_conf": 0.5460543632507324,
"adv/mean_abs_reasoning": 0.4494268000125885,
"adv/mean_abs_step_conf": 0.7579556107521057,
"adv/ratio_final_to_reasoning": 1.2150017827940776,
"adv/ratio_step_to_reasoning": 1.6864940202294907,
"adv/std_final_conf": 0.7671118378639221,
"adv/std_reasoning": 0.681749701499939,
"adv/std_step_conf": 0.9289894700050354,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.8942547315956085,
"calib/avg_num_step_conf": 6.14453125,
"calib/ece": 0.1483870967741935,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6330645161290323,
"calib/gap": 0.5755102040816326,
"calib/mean_conf": 0.7011290322580644,
"calib/mu_c": 0.9355102040816325,
"calib/mu_w": 0.36,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.12838709677419352,
"calib/std_conf": 0.41539664231605267,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5407525510204082,
"calib/step_q_c_n": 784.0,
"calib/step_q_gap": 0.2292443127441091,
"calib/step_q_w": 0.3115082382762991,
"calib/step_q_w_n": 789.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2963.0,
"completions/max_terminated_length": 2963.0,
"completions/mean_length": 560.36328125,
"completions/mean_terminated_length": 564.7755737304688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 86.0,
"epoch": 0.15893333333333334,
"grad_norm": 0.04593983665108681,
"kl": 0.0635223388671875,
"learning_rate": 1.4166666666666667e-06,
"loss": 0.0619,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03306441754102707,
"mask/share_reasoning": 0.83942711353302,
"mask/share_step_conf": 0.11969595402479172,
"num_tokens": 35070245.0,
"reward": 1.0043818950653076,
"reward_std": 0.1608709990978241,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.8255242109298706,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8746458292007446,
"step": 149
},
{
"adv/mean_abs_final_conf": 0.5928109884262085,
"adv/mean_abs_reasoning": 0.5092207193374634,
"adv/mean_abs_step_conf": 0.7641928791999817,
"adv/ratio_final_to_reasoning": 1.1641533148877028,
"adv/ratio_step_to_reasoning": 1.50071049778622,
"adv/std_final_conf": 0.843606173992157,
"adv/std_reasoning": 0.7753878235816956,
"adv/std_step_conf": 0.9338046908378601,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.8056117290192113,
"calib/avg_num_step_conf": 5.6171875,
"calib/ece": 0.2101417004048584,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.6761133603238867,
"calib/gap": 0.3927350859453996,
"calib/mean_conf": 0.7373886639676113,
"calib/mu_c": 0.8741304347826088,
"calib/mu_w": 0.4813953488372092,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14785425101214586,
"calib/std_conf": 0.40214979327082795,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5194275240384616,
"calib/step_q_c_n": 832.0,
"calib/step_q_gap": 0.1665397352595837,
"calib/step_q_w": 0.3528877887788779,
"calib/step_q_w_n": 606.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3023.0,
"completions/max_terminated_length": 3023.0,
"completions/mean_length": 512.07421875,
"completions/mean_terminated_length": 512.07421875,
"completions/min_length": 103.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.16,
"grad_norm": 0.06127196177840233,
"kl": 0.076141357421875,
"learning_rate": 1.3888888888888892e-06,
"loss": 0.0391,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03732970356941223,
"mask/share_reasoning": 0.832332968711853,
"mask/share_step_conf": 0.13033737242221832,
"num_tokens": 35306296.0,
"reward": 0.9579633474349976,
"reward_std": 0.18849684298038483,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.7547647356987,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.842411994934082,
"step": 150
},
{
"adv/mean_abs_final_conf": 0.655048668384552,
"adv/mean_abs_reasoning": 0.46310627460479736,
"adv/mean_abs_step_conf": 0.7712806463241577,
"adv/ratio_final_to_reasoning": 1.4144672709164936,
"adv/ratio_step_to_reasoning": 1.6654506505711852,
"adv/std_final_conf": 0.8607724905014038,
"adv/std_reasoning": 0.7207038402557373,
"adv/std_step_conf": 0.9343538284301758,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7660818713450293,
"calib/avg_num_step_conf": 5.66796875,
"calib/ece": 0.23698795180722892,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.570281124497992,
"calib/gap": 0.40570565302144246,
"calib/mean_conf": 0.6256626506024096,
"calib/mu_c": 0.8114074074074074,
"calib/mu_w": 0.4057017543859649,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1602409638554217,
"calib/std_conf": 0.44182951000839615,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4922701949860725,
"calib/step_q_c_n": 718.0,
"calib/step_q_gap": 0.15772722090694558,
"calib/step_q_w": 0.3345429740791269,
"calib/step_q_w_n": 733.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2932.0,
"completions/max_terminated_length": 2932.0,
"completions/mean_length": 553.15234375,
"completions/mean_terminated_length": 557.5078735351562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.16106666666666666,
"grad_norm": 0.026984497904777527,
"kl": 0.058818817138671875,
"learning_rate": 1.3611111111111112e-06,
"loss": -0.0642,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.032044894993305206,
"mask/share_reasoning": 0.8480304479598999,
"mask/share_step_conf": 0.11211220920085907,
"num_tokens": 35554927.0,
"reward": 0.9289988875389099,
"reward_std": 0.19891154766082764,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.7304683923721313,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8267481327056885,
"step": 151
},
{
"adv/mean_abs_final_conf": 0.6867839097976685,
"adv/mean_abs_reasoning": 0.5296306610107422,
"adv/mean_abs_step_conf": 0.7530745267868042,
"adv/ratio_final_to_reasoning": 1.2967223394639134,
"adv/ratio_step_to_reasoning": 1.4218861977319135,
"adv/std_final_conf": 0.8762410283088684,
"adv/std_reasoning": 0.7928329706192017,
"adv/std_step_conf": 0.9337801337242126,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7715010683760684,
"calib/avg_num_step_conf": 6.03125,
"calib/ece": 0.24040816326530617,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.563265306122449,
"calib/gap": 0.3778725961538462,
"calib/mean_conf": 0.651265306122449,
"calib/mu_c": 0.8317187500000001,
"calib/mu_w": 0.45384615384615384,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.18461224489795924,
"calib/std_conf": 0.4215293233323014,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.49577348066298343,
"calib/step_q_c_n": 724.0,
"calib/step_q_gap": 0.13735884651664193,
"calib/step_q_w": 0.3584146341463415,
"calib/step_q_w_n": 820.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2585.0,
"completions/max_terminated_length": 2585.0,
"completions/mean_length": 561.15234375,
"completions/mean_terminated_length": 565.5708618164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.16213333333333332,
"grad_norm": 0.049046602100133896,
"kl": 0.06014251708984375,
"learning_rate": 1.3333333333333334e-06,
"loss": -0.0531,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.03138267248868942,
"mask/share_reasoning": 0.8373855948448181,
"mask/share_step_conf": 0.12341928482055664,
"num_tokens": 35803974.0,
"reward": 0.9200597405433655,
"reward_std": 0.22718459367752075,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.7127765417098999,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.8359366655349731,
"step": 152
},
{
"adv/mean_abs_final_conf": 0.6593279838562012,
"adv/mean_abs_reasoning": 0.46595901250839233,
"adv/mean_abs_step_conf": 0.7748792767524719,
"adv/ratio_final_to_reasoning": 1.4149913751144068,
"adv/ratio_step_to_reasoning": 1.6629773348112151,
"adv/std_final_conf": 0.8450661301612854,
"adv/std_reasoning": 0.7206805944442749,
"adv/std_step_conf": 0.9337031245231628,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.6682795698924732,
"calib/avg_num_step_conf": 6.5625,
"calib/ece": 0.2902057613168725,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.6008230452674898,
"calib/gap": 0.26399354838709677,
"calib/mean_conf": 0.6720987654320988,
"calib/mu_c": 0.7731333333333333,
"calib/mu_w": 0.5091397849462366,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1725102880658437,
"calib/std_conf": 0.42473341738330084,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5326021798365123,
"calib/step_q_c_n": 734.0,
"calib/step_q_gap": 0.25974805721494776,
"calib/step_q_w": 0.2728541226215645,
"calib/step_q_w_n": 946.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2562.0,
"completions/max_terminated_length": 2562.0,
"completions/mean_length": 503.84765625,
"completions/mean_terminated_length": 526.4693603515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.1632,
"grad_norm": 0.0726659968495369,
"kl": 0.0702667236328125,
"learning_rate": 1.3055555555555556e-06,
"loss": -0.1991,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.030434779822826385,
"mask/share_reasoning": 0.8228424787521362,
"mask/share_step_conf": 0.10375404357910156,
"num_tokens": 36040279.0,
"reward": 0.8930153846740723,
"reward_std": 0.1965997815132141,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6692812442779541,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.8097181916236877,
"step": 153
},
{
"adv/mean_abs_final_conf": 0.620254397392273,
"adv/mean_abs_reasoning": 0.4352233409881592,
"adv/mean_abs_step_conf": 0.7530338764190674,
"adv/ratio_final_to_reasoning": 1.4251404715197658,
"adv/ratio_step_to_reasoning": 1.730224014891597,
"adv/std_final_conf": 0.8530692458152771,
"adv/std_reasoning": 0.7013433575630188,
"adv/std_step_conf": 0.93345046043396,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7079229253878355,
"calib/avg_num_step_conf": 5.10546875,
"calib/ece": 0.28772549019607846,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.6274509803921569,
"calib/gap": 0.3493671509480424,
"calib/mean_conf": 0.685607843137255,
"calib/mu_c": 0.8554961832061069,
"calib/mu_w": 0.5061290322580645,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2298039215686275,
"calib/std_conf": 0.4278366959131226,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5391284403669725,
"calib/step_q_c_n": 654.0,
"calib/step_q_gap": 0.15605033929499695,
"calib/step_q_w": 0.38307810107197554,
"calib/step_q_w_n": 653.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2117.0,
"completions/max_terminated_length": 2117.0,
"completions/mean_length": 478.34765625,
"completions/mean_terminated_length": 478.34765625,
"completions/min_length": 155.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.16426666666666667,
"grad_norm": 0.043535955250263214,
"kl": 0.071868896484375,
"learning_rate": 1.2777777777777779e-06,
"loss": 0.0157,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03501860797405243,
"mask/share_reasoning": 0.8479246497154236,
"mask/share_step_conf": 0.1170567199587822,
"num_tokens": 36267176.0,
"reward": 0.9365236163139343,
"reward_std": 0.187381774187088,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.7093707323074341,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8621140122413635,
"step": 154
},
{
"adv/mean_abs_final_conf": 0.675777792930603,
"adv/mean_abs_reasoning": 0.4805974066257477,
"adv/mean_abs_step_conf": 0.7654271125793457,
"adv/ratio_final_to_reasoning": 1.406120348578674,
"adv/ratio_step_to_reasoning": 1.5926576007835214,
"adv/std_final_conf": 0.857191264629364,
"adv/std_reasoning": 0.7392981648445129,
"adv/std_step_conf": 0.9342840313911438,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7550637659414854,
"calib/avg_num_step_conf": 5.55859375,
"calib/ece": 0.2551383399209485,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5652173913043478,
"calib/gap": 0.4017823205801449,
"calib/mean_conf": 0.6168774703557313,
"calib/mu_c": 0.813798449612403,
"calib/mu_w": 0.4120161290322581,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.18106719367588922,
"calib/std_conf": 0.4491829789994829,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4879407407407408,
"calib/step_q_c_n": 675.0,
"calib/step_q_gap": 0.15923753218459102,
"calib/step_q_w": 0.32870320855614976,
"calib/step_q_w_n": 748.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2004.0,
"completions/max_terminated_length": 2004.0,
"completions/mean_length": 485.56640625,
"completions/mean_terminated_length": 487.4706115722656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.16533333333333333,
"grad_norm": 0.048178721219301224,
"kl": 0.0730438232421875,
"learning_rate": 1.25e-06,
"loss": 0.0136,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.0337291955947876,
"mask/share_reasoning": 0.8391321897506714,
"mask/share_step_conf": 0.12323231995105743,
"num_tokens": 36498697.0,
"reward": 0.9229997396469116,
"reward_std": 0.18754538893699646,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.729051947593689,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.818510115146637,
"step": 155
},
{
"adv/mean_abs_final_conf": 0.6232823133468628,
"adv/mean_abs_reasoning": 0.49731171131134033,
"adv/mean_abs_step_conf": 0.7660905718803406,
"adv/ratio_final_to_reasoning": 1.2533031078302095,
"adv/ratio_step_to_reasoning": 1.5404635653165468,
"adv/std_final_conf": 0.8286430835723877,
"adv/std_reasoning": 0.7393476366996765,
"adv/std_step_conf": 0.9338200688362122,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7351076476026944,
"calib/avg_num_step_conf": 6.0625,
"calib/ece": 0.2925101214574899,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.6720647773279352,
"calib/gap": 0.29858605204068167,
"calib/mean_conf": 0.7248178137651821,
"calib/mu_c": 0.8614179104477613,
"calib/mu_w": 0.5628318584070796,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2374089068825911,
"calib/std_conf": 0.4067893784099128,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.511150442477876,
"calib/step_q_c_n": 791.0,
"calib/step_q_gap": 0.13062481829916378,
"calib/step_q_w": 0.3805256241787122,
"calib/step_q_w_n": 761.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2771.0,
"completions/max_terminated_length": 2771.0,
"completions/mean_length": 555.74609375,
"completions/mean_terminated_length": 560.1220703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.1664,
"grad_norm": 0.037357404828071594,
"kl": 0.06532669067382812,
"learning_rate": 1.2222222222222223e-06,
"loss": 0.0328,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03344070166349411,
"mask/share_reasoning": 0.8330366611480713,
"mask/share_step_conf": 0.1257101595401764,
"num_tokens": 36745728.0,
"reward": 0.8930503129959106,
"reward_std": 0.21639417111873627,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.6766519546508789,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8117923736572266,
"step": 156
},
{
"adv/mean_abs_final_conf": 0.5315195322036743,
"adv/mean_abs_reasoning": 0.46138864755630493,
"adv/mean_abs_step_conf": 0.7667800188064575,
"adv/ratio_final_to_reasoning": 1.1519995886739087,
"adv/ratio_step_to_reasoning": 1.6618961538555943,
"adv/std_final_conf": 0.7801711559295654,
"adv/std_reasoning": 0.7206206321716309,
"adv/std_step_conf": 0.9337723851203918,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.8493701055498808,
"calib/avg_num_step_conf": 6.1328125,
"calib/ece": 0.15578740157480314,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.6456692913385826,
"calib/gap": 0.543925774599932,
"calib/mean_conf": 0.6954724409448819,
"calib/mu_c": 0.8860606060606062,
"calib/mu_w": 0.3421348314606742,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.10082677165354331,
"calib/std_conf": 0.4248607613834598,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5660838445807772,
"calib/step_q_c_n": 978.0,
"calib/step_q_gap": 0.22544195268888534,
"calib/step_q_w": 0.34064189189189187,
"calib/step_q_w_n": 592.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2705.0,
"completions/max_terminated_length": 2705.0,
"completions/mean_length": 521.609375,
"completions/mean_terminated_length": 523.6549072265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.16746666666666668,
"grad_norm": 0.034724507480859756,
"kl": 0.068572998046875,
"learning_rate": 1.1944444444444446e-06,
"loss": 0.0043,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.033151306211948395,
"mask/share_reasoning": 0.8360445499420166,
"mask/share_step_conf": 0.1268979161977768,
"num_tokens": 36982988.0,
"reward": 1.0144009590148926,
"reward_std": 0.16522559523582458,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.8308441638946533,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8706140518188477,
"step": 157
},
{
"adv/mean_abs_final_conf": 0.5890312194824219,
"adv/mean_abs_reasoning": 0.42107200622558594,
"adv/mean_abs_step_conf": 0.7548243403434753,
"adv/ratio_final_to_reasoning": 1.3988847768874313,
"adv/ratio_step_to_reasoning": 1.7926253210456462,
"adv/std_final_conf": 0.7838677763938904,
"adv/std_reasoning": 0.6816832423210144,
"adv/std_step_conf": 0.9335840344429016,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6977124183006537,
"calib/avg_num_step_conf": 5.88671875,
"calib/ece": 0.2811904761904762,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.7698412698412699,
"calib/gap": 0.25526274509803903,
"calib/mean_conf": 0.8084126984126985,
"calib/mu_c": 0.9117333333333333,
"calib/mu_w": 0.6564705882352943,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2471825396825397,
"calib/std_conf": 0.3575321335284662,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.564733581164808,
"calib/step_q_c_n": 807.0,
"calib/step_q_gap": 0.16187643830766513,
"calib/step_q_w": 0.40285714285714286,
"calib/step_q_w_n": 700.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2211.0,
"completions/max_terminated_length": 2211.0,
"completions/mean_length": 488.40625,
"completions/mean_terminated_length": 494.1976623535156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.16853333333333334,
"grad_norm": 0.030512472614645958,
"kl": 0.068450927734375,
"learning_rate": 1.1666666666666668e-06,
"loss": -0.0752,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03567176312208176,
"mask/share_reasoning": 0.8231402039527893,
"mask/share_step_conf": 0.12946924567222595,
"num_tokens": 37213260.0,
"reward": 0.9132636189460754,
"reward_std": 0.1889183521270752,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6938175559043884,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8194283843040466,
"step": 158
},
{
"adv/mean_abs_final_conf": 0.5502872467041016,
"adv/mean_abs_reasoning": 0.4450033903121948,
"adv/mean_abs_step_conf": 0.7452594637870789,
"adv/ratio_final_to_reasoning": 1.2365911332002306,
"adv/ratio_step_to_reasoning": 1.6747276088486371,
"adv/std_final_conf": 0.8012663125991821,
"adv/std_reasoning": 0.7205966711044312,
"adv/std_step_conf": 0.9334879517555237,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.74140522875817,
"calib/avg_num_step_conf": 5.42578125,
"calib/ece": 0.20841269841269838,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6666666666666666,
"calib/gap": 0.4341176470588235,
"calib/mean_conf": 0.705952380952381,
"calib/mu_c": 0.8816666666666667,
"calib/mu_w": 0.4475490196078432,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15956349206349202,
"calib/std_conf": 0.4222185896132073,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.555981308411215,
"calib/step_q_c_n": 749.0,
"calib/step_q_gap": 0.19538755841121497,
"calib/step_q_w": 0.36059375,
"calib/step_q_w_n": 640.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2803.0,
"completions/max_terminated_length": 2803.0,
"completions/mean_length": 498.8515625,
"completions/mean_terminated_length": 500.807861328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.1696,
"grad_norm": 0.040636204183101654,
"kl": 0.071807861328125,
"learning_rate": 1.138888888888889e-06,
"loss": 0.0685,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03515962138772011,
"mask/share_reasoning": 0.8426576852798462,
"mask/share_step_conf": 0.11827646195888519,
"num_tokens": 37445750.0,
"reward": 0.9766671061515808,
"reward_std": 0.16175855696201324,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.7655757665634155,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8736958503723145,
"step": 159
},
{
"adv/mean_abs_final_conf": 0.6365013122558594,
"adv/mean_abs_reasoning": 0.4862366318702698,
"adv/mean_abs_step_conf": 0.7508884072303772,
"adv/ratio_final_to_reasoning": 1.3090361164431579,
"adv/ratio_step_to_reasoning": 1.5442859669830835,
"adv/std_final_conf": 0.8413640856742859,
"adv/std_reasoning": 0.7576702237129211,
"adv/std_step_conf": 0.9344907402992249,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7269791666666666,
"calib/avg_num_step_conf": 5.83203125,
"calib/ece": 0.2636585365853658,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.6097560975609756,
"calib/gap": 0.34445416666666673,
"calib/mean_conf": 0.6595121951219513,
"calib/mu_c": 0.7939333333333334,
"calib/mu_w": 0.44947916666666665,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.15670731707317068,
"calib/std_conf": 0.44032388976758857,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5100624219725343,
"calib/step_q_c_n": 801.0,
"calib/step_q_gap": 0.17267802890895045,
"calib/step_q_w": 0.3373843930635838,
"calib/step_q_w_n": 692.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2923.0,
"completions/max_terminated_length": 2923.0,
"completions/mean_length": 522.58984375,
"completions/mean_terminated_length": 530.8849487304688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.17066666666666666,
"grad_norm": 0.04386411979794502,
"kl": 0.06703948974609375,
"learning_rate": 1.111111111111111e-06,
"loss": -0.0188,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03261594846844673,
"mask/share_reasoning": 0.8347195386886597,
"mask/share_step_conf": 0.11703953146934509,
"num_tokens": 37684373.0,
"reward": 0.9110732674598694,
"reward_std": 0.23112061619758606,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.7011132836341858,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8116582036018372,
"step": 160
},
{
"adv/mean_abs_final_conf": 0.5453575253486633,
"adv/mean_abs_reasoning": 0.4391787052154541,
"adv/mean_abs_step_conf": 0.7485448718070984,
"adv/ratio_final_to_reasoning": 1.2417667771052778,
"adv/ratio_step_to_reasoning": 1.7044197792784015,
"adv/std_final_conf": 0.7737199664115906,
"adv/std_reasoning": 0.7206230759620667,
"adv/std_step_conf": 0.9342144131660461,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7818750954344175,
"calib/avg_num_step_conf": 5.546875,
"calib/ece": 0.18187250996015936,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6573705179282868,
"calib/gap": 0.44304168575355013,
"calib/mean_conf": 0.7197211155378486,
"calib/mu_c": 0.8503389830508474,
"calib/mu_w": 0.4072972972972973,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.09820717131474105,
"calib/std_conf": 0.4102000312649325,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5249844559585491,
"calib/step_q_c_n": 965.0,
"calib/step_q_gap": 0.14935808233217557,
"calib/step_q_w": 0.3756263736263736,
"calib/step_q_w_n": 455.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1989.0,
"completions/max_terminated_length": 1989.0,
"completions/mean_length": 472.8984375,
"completions/mean_terminated_length": 480.40478515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.17173333333333332,
"grad_norm": 0.043638553470373154,
"kl": 0.06868743896484375,
"learning_rate": 1.0833333333333335e-06,
"loss": -0.0168,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03504636883735657,
"mask/share_reasoning": 0.8246728777885437,
"mask/share_step_conf": 0.12465573847293854,
"num_tokens": 37909355.0,
"reward": 0.9816228747367859,
"reward_std": 0.18595723807811737,
"rewards/accuracy_reward_step": 0.69140625,
"rewards/final_brier_reward_step": 0.7881566286087036,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8414953947067261,
"step": 161
},
{
"adv/mean_abs_final_conf": 0.5180578827857971,
"adv/mean_abs_reasoning": 0.31734955310821533,
"adv/mean_abs_step_conf": 0.7810468077659607,
"adv/ratio_final_to_reasoning": 1.6324519058299753,
"adv/ratio_step_to_reasoning": 2.461156160820639,
"adv/std_final_conf": 0.762174665927887,
"adv/std_reasoning": 0.5960145592689514,
"adv/std_step_conf": 0.9328275322914124,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7803224056420988,
"calib/avg_num_step_conf": 5.2265625,
"calib/ece": 0.15679687500000003,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.73046875,
"calib/gap": 0.45043788266294665,
"calib/mean_conf": 0.780625,
"calib/mu_c": 0.9020320855614974,
"calib/mu_w": 0.45159420289855073,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10347656250000004,
"calib/std_conf": 0.37814772658975493,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5128571428571428,
"calib/step_q_c_n": 959.0,
"calib/step_q_gap": 0.0898228420655861,
"calib/step_q_w": 0.4230343007915567,
"calib/step_q_w_n": 379.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 899.0,
"completions/max_terminated_length": 899.0,
"completions/mean_length": 433.8125,
"completions/mean_terminated_length": 435.5137634277344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.1728,
"grad_norm": 0.04205428436398506,
"kl": 0.09763336181640625,
"learning_rate": 1.0555555555555557e-06,
"loss": 0.102,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03682439774274826,
"mask/share_reasoning": 0.8325048685073853,
"mask/share_step_conf": 0.1267644464969635,
"num_tokens": 38124555.0,
"reward": 1.028838038444519,
"reward_std": 0.14918027818202972,
"rewards/accuracy_reward_step": 0.73046875,
"rewards/final_brier_reward_step": 0.834972620010376,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.8766096830368042,
"step": 162
},
{
"adv/mean_abs_final_conf": 0.5653954148292542,
"adv/mean_abs_reasoning": 0.4558749794960022,
"adv/mean_abs_step_conf": 0.7330037355422974,
"adv/ratio_final_to_reasoning": 1.2402422599598109,
"adv/ratio_step_to_reasoning": 1.607905168107006,
"adv/std_final_conf": 0.8036025762557983,
"adv/std_reasoning": 0.7207197546958923,
"adv/std_step_conf": 0.9333640336990356,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7831508114526982,
"calib/avg_num_step_conf": 6.44140625,
"calib/ece": 0.19638554216867476,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5783132530120482,
"calib/gap": 0.4668135637946959,
"calib/mean_conf": 0.6514859437751004,
"calib/mu_c": 0.8502097902097903,
"calib/mu_w": 0.38339622641509435,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1367871485943776,
"calib/std_conf": 0.43866148268199695,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5261176470588236,
"calib/step_q_c_n": 765.0,
"calib/step_q_gap": 0.21495248868778288,
"calib/step_q_w": 0.3111651583710407,
"calib/step_q_w_n": 884.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2422.0,
"completions/max_terminated_length": 2422.0,
"completions/mean_length": 523.34375,
"completions/mean_terminated_length": 531.6508178710938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.17386666666666667,
"grad_norm": 0.04157635197043419,
"kl": 0.07187652587890625,
"learning_rate": 1.0277777777777777e-06,
"loss": -0.0034,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03407448157668114,
"mask/share_reasoning": 0.8247135877609253,
"mask/share_step_conf": 0.12558691203594208,
"num_tokens": 38363363.0,
"reward": 0.9599740505218506,
"reward_std": 0.19809526205062866,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7604609131813049,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8540183305740356,
"step": 163
},
{
"adv/mean_abs_final_conf": 0.6849848031997681,
"adv/mean_abs_reasoning": 0.5212238430976868,
"adv/mean_abs_step_conf": 0.7627356648445129,
"adv/ratio_final_to_reasoning": 1.3141854738049454,
"adv/ratio_step_to_reasoning": 1.463355283809537,
"adv/std_final_conf": 0.8790825605392456,
"adv/std_reasoning": 0.7753711342811584,
"adv/std_step_conf": 0.9340617656707764,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7601245085190039,
"calib/avg_num_step_conf": 6.25,
"calib/ece": 0.2584337349397591,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5742971887550201,
"calib/gap": 0.3704062909567496,
"calib/mean_conf": 0.6364257028112449,
"calib/mu_c": 0.7985714285714286,
"calib/mu_w": 0.428165137614679,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.16630522088353417,
"calib/std_conf": 0.44016480518654333,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5005243902439025,
"calib/step_q_c_n": 820.0,
"calib/step_q_gap": 0.1758192620387743,
"calib/step_q_w": 0.3247051282051282,
"calib/step_q_w_n": 780.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2644.0,
"completions/max_terminated_length": 2644.0,
"completions/mean_length": 560.58203125,
"completions/mean_terminated_length": 569.4801635742188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.17493333333333333,
"grad_norm": 0.04676659405231476,
"kl": 0.0653533935546875,
"learning_rate": 1.0000000000000002e-06,
"loss": -0.0637,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.02996118739247322,
"mask/share_reasoning": 0.8411446809768677,
"mask/share_step_conf": 0.11326909065246582,
"num_tokens": 38613008.0,
"reward": 0.9299562573432922,
"reward_std": 0.20216163992881775,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.7168090343475342,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8391972184181213,
"step": 164
},
{
"adv/mean_abs_final_conf": 0.4965021014213562,
"adv/mean_abs_reasoning": 0.3882831335067749,
"adv/mean_abs_step_conf": 0.7505602836608887,
"adv/ratio_final_to_reasoning": 1.2787114828738577,
"adv/ratio_step_to_reasoning": 1.9330231444312598,
"adv/std_final_conf": 0.7457696795463562,
"adv/std_reasoning": 0.7012252807617188,
"adv/std_step_conf": 0.9331449866294861,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7731105462210924,
"calib/avg_num_step_conf": 5.2734375,
"calib/ece": 0.24417322834645674,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5905511811023622,
"calib/gap": 0.44362204724409443,
"calib/mean_conf": 0.6433858267716536,
"calib/mu_c": 0.8651968503937008,
"calib/mu_w": 0.4215748031496063,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.19377952755905517,
"calib/std_conf": 0.44615897112537706,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5396116504854369,
"calib/step_q_c_n": 618.0,
"calib/step_q_gap": 0.17704334447450792,
"calib/step_q_w": 0.362568306010929,
"calib/step_q_w_n": 732.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2283.0,
"completions/max_terminated_length": 2283.0,
"completions/mean_length": 518.08203125,
"completions/mean_terminated_length": 518.08203125,
"completions/min_length": 141.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.176,
"grad_norm": 0.06405764073133469,
"kl": 0.07071685791015625,
"learning_rate": 9.722222222222224e-07,
"loss": 0.0184,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03335200995206833,
"mask/share_reasoning": 0.8532591462135315,
"mask/share_step_conf": 0.11338884383440018,
"num_tokens": 38851213.0,
"reward": 0.9515175819396973,
"reward_std": 0.15023109316825867,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.7463171482086182,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8590617179870605,
"step": 165
},
{
"adv/mean_abs_final_conf": 0.490718275308609,
"adv/mean_abs_reasoning": 0.3937041163444519,
"adv/mean_abs_step_conf": 0.7531986832618713,
"adv/ratio_final_to_reasoning": 1.2464138802127214,
"adv/ratio_step_to_reasoning": 1.9131084791678872,
"adv/std_final_conf": 0.7293896675109863,
"adv/std_reasoning": 0.6613433361053467,
"adv/std_step_conf": 0.9326786398887634,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.8938230994152047,
"calib/avg_num_step_conf": 5.84765625,
"calib/ece": 0.12338645418326699,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6135458167330677,
"calib/gap": 0.6554312865497076,
"calib/mean_conf": 0.6602788844621513,
"calib/mu_c": 0.8691812865497076,
"calib/mu_w": 0.21375000000000002,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.051195219123506046,
"calib/std_conf": 0.444739417316579,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5190594594594594,
"calib/step_q_c_n": 925.0,
"calib/step_q_gap": 0.21079022869022868,
"calib/step_q_w": 0.30826923076923074,
"calib/step_q_w_n": 572.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2650.0,
"completions/max_terminated_length": 2650.0,
"completions/mean_length": 535.546875,
"completions/mean_terminated_length": 537.6470947265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.17706666666666668,
"grad_norm": 0.04973738268017769,
"kl": 0.16271209716796875,
"learning_rate": 9.444444444444445e-07,
"loss": -0.0363,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03301393240690231,
"mask/share_reasoning": 0.8391966223716736,
"mask/share_step_conf": 0.1238832175731659,
"num_tokens": 39094497.0,
"reward": 1.0374794006347656,
"reward_std": 0.14901401102542877,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.8522887229919434,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8929827213287354,
"step": 166
},
{
"adv/mean_abs_final_conf": 0.5144798755645752,
"adv/mean_abs_reasoning": 0.4528118073940277,
"adv/mean_abs_step_conf": 0.7564898729324341,
"adv/ratio_final_to_reasoning": 1.1361891787350968,
"adv/ratio_step_to_reasoning": 1.670649617743187,
"adv/std_final_conf": 0.7716760039329529,
"adv/std_reasoning": 0.7205672860145569,
"adv/std_step_conf": 0.93340003490448,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6497036657642496,
"calib/avg_num_step_conf": 5.40234375,
"calib/ece": 0.228888888888889,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.75,
"calib/gap": 0.3497182995536694,
"calib/mean_conf": 0.7766666666666666,
"calib/mu_c": 0.886300578034682,
"calib/mu_w": 0.5365822784810126,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.15952380952380965,
"calib/std_conf": 0.392903316189894,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.544224318658281,
"calib/step_q_c_n": 954.0,
"calib/step_q_gap": 0.0957627801967425,
"calib/step_q_w": 0.4484615384615385,
"calib/step_q_w_n": 429.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2540.0,
"completions/max_terminated_length": 2540.0,
"completions/mean_length": 493.9296875,
"completions/mean_terminated_length": 495.86669921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.17813333333333334,
"grad_norm": 0.05870332941412926,
"kl": 0.0638885498046875,
"learning_rate": 9.166666666666666e-07,
"loss": 0.0712,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03407091647386551,
"mask/share_reasoning": 0.8457885980606079,
"mask/share_step_conf": 0.1162342056632042,
"num_tokens": 39326551.0,
"reward": 0.9671033024787903,
"reward_std": 0.17421361804008484,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7607374787330627,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8414378762245178,
"step": 167
},
{
"adv/mean_abs_final_conf": 0.5943257808685303,
"adv/mean_abs_reasoning": 0.4640789330005646,
"adv/mean_abs_step_conf": 0.7342292070388794,
"adv/ratio_final_to_reasoning": 1.2806566698165702,
"adv/ratio_step_to_reasoning": 1.5821213910565213,
"adv/std_final_conf": 0.8195285797119141,
"adv/std_reasoning": 0.701535701751709,
"adv/std_step_conf": 0.9333325624465942,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7641162029459903,
"calib/avg_num_step_conf": 5.92578125,
"calib/ece": 0.2363199999999999,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.616,
"calib/gap": 0.41958537915984717,
"calib/mean_conf": 0.6648,
"calib/mu_c": 0.8225641025641025,
"calib/mu_w": 0.40297872340425533,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1385599999999999,
"calib/std_conf": 0.43775902046674037,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5240274314214464,
"calib/step_q_c_n": 802.0,
"calib/step_q_gap": 0.20019526358927853,
"calib/step_q_w": 0.32383216783216784,
"calib/step_q_w_n": 715.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2319.0,
"completions/max_terminated_length": 2319.0,
"completions/mean_length": 535.79296875,
"completions/mean_terminated_length": 546.4661254882812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 123.0,
"epoch": 0.1792,
"grad_norm": 0.02657085470855236,
"kl": 0.06554412841796875,
"learning_rate": 8.88888888888889e-07,
"loss": -0.1271,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03261389583349228,
"mask/share_reasoning": 0.8362942934036255,
"mask/share_step_conf": 0.11156059056520462,
"num_tokens": 39568386.0,
"reward": 0.9644992351531982,
"reward_std": 0.18663950264453888,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7509452700614929,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8608657121658325,
"step": 168
},
{
"adv/mean_abs_final_conf": 0.5595456957817078,
"adv/mean_abs_reasoning": 0.4441044330596924,
"adv/mean_abs_step_conf": 0.7435603141784668,
"adv/ratio_final_to_reasoning": 1.2599417031860587,
"adv/ratio_step_to_reasoning": 1.6742915828505687,
"adv/std_final_conf": 0.7765763401985168,
"adv/std_reasoning": 0.7014807462692261,
"adv/std_step_conf": 0.9342918395996094,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7951638689048761,
"calib/avg_num_step_conf": 5.19140625,
"calib/ece": 0.20862348178137655,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.5951417004048583,
"calib/gap": 0.4529303224087397,
"calib/mean_conf": 0.6636842105263158,
"calib/mu_c": 0.861726618705036,
"calib/mu_w": 0.40879629629629627,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1547773279352227,
"calib/std_conf": 0.4324253574325776,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5857077625570777,
"calib/step_q_c_n": 657.0,
"calib/step_q_gap": 0.2105887149380301,
"calib/step_q_w": 0.3751190476190476,
"calib/step_q_w_n": 672.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2674.0,
"completions/max_terminated_length": 2674.0,
"completions/mean_length": 504.5625,
"completions/mean_terminated_length": 510.54547119140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 182.0,
"epoch": 0.18026666666666666,
"grad_norm": 0.04750969633460045,
"kl": 0.06903839111328125,
"learning_rate": 8.611111111111112e-07,
"loss": -0.0662,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03275205194950104,
"mask/share_reasoning": 0.845567524433136,
"mask/share_step_conf": 0.10996170341968536,
"num_tokens": 39801738.0,
"reward": 0.9409176707267761,
"reward_std": 0.2118673324584961,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.7484175562858582,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8326364755630493,
"step": 169
},
{
"adv/mean_abs_final_conf": 0.5320782661437988,
"adv/mean_abs_reasoning": 0.38035115599632263,
"adv/mean_abs_step_conf": 0.7660174369812012,
"adv/ratio_final_to_reasoning": 1.3989132351919107,
"adv/ratio_step_to_reasoning": 2.0139742574848576,
"adv/std_final_conf": 0.778958797454834,
"adv/std_reasoning": 0.6612679958343506,
"adv/std_step_conf": 0.9337641000747681,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7838235294117648,
"calib/avg_num_step_conf": 5.72265625,
"calib/ece": 0.20632411067193668,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.7075098814229249,
"calib/gap": 0.46325751633986934,
"calib/mean_conf": 0.7273517786561265,
"calib/mu_c": 0.9104575163398694,
"calib/mu_w": 0.44720000000000004,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.16446640316205524,
"calib/std_conf": 0.42422831914620107,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5701946472019465,
"calib/step_q_c_n": 822.0,
"calib/step_q_gap": 0.21999246990801175,
"calib/step_q_w": 0.3502021772939347,
"calib/step_q_w_n": 643.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2636.0,
"completions/max_terminated_length": 2636.0,
"completions/mean_length": 525.0,
"completions/mean_terminated_length": 527.058837890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.18133333333333335,
"grad_norm": 0.03782640025019646,
"kl": 0.0851593017578125,
"learning_rate": 8.333333333333333e-07,
"loss": 0.0339,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.032207902520895004,
"mask/share_reasoning": 0.8428486585617065,
"mask/share_step_conf": 0.12103715538978577,
"num_tokens": 40040290.0,
"reward": 0.9744622707366943,
"reward_std": 0.19172057509422302,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.7742984294891357,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8582197427749634,
"step": 170
},
{
"adv/mean_abs_final_conf": 0.5948938131332397,
"adv/mean_abs_reasoning": 0.3816094398498535,
"adv/mean_abs_step_conf": 0.7755590677261353,
"adv/ratio_final_to_reasoning": 1.5589074876326545,
"adv/ratio_step_to_reasoning": 2.032337218993544,
"adv/std_final_conf": 0.8434175252914429,
"adv/std_reasoning": 0.6814701557159424,
"adv/std_step_conf": 0.9340186715126038,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7388299266247379,
"calib/avg_num_step_conf": 5.96875,
"calib/ece": 0.27612000000000003,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.552,
"calib/gap": 0.3564544025157233,
"calib/mean_conf": 0.62428,
"calib/mu_c": 0.7754166666666666,
"calib/mu_w": 0.41896226415094334,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16219999999999998,
"calib/std_conf": 0.4515622676885216,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5402421307506053,
"calib/step_q_c_n": 826.0,
"calib/step_q_gap": 0.20239312790160247,
"calib/step_q_w": 0.3378490028490029,
"calib/step_q_w_n": 702.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2935.0,
"completions/max_terminated_length": 2935.0,
"completions/mean_length": 513.85546875,
"completions/mean_terminated_length": 519.9486083984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.1824,
"grad_norm": 0.05200982093811035,
"kl": 0.06806182861328125,
"learning_rate": 8.055555555555557e-07,
"loss": -0.0064,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.032680168747901917,
"mask/share_reasoning": 0.8358561396598816,
"mask/share_step_conf": 0.11974497139453888,
"num_tokens": 40278733.0,
"reward": 0.9288033843040466,
"reward_std": 0.17771439254283905,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.7066855430603027,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8431086540222168,
"step": 171
},
{
"adv/mean_abs_final_conf": 0.5560131669044495,
"adv/mean_abs_reasoning": 0.39456892013549805,
"adv/mean_abs_step_conf": 0.7697474956512451,
"adv/ratio_final_to_reasoning": 1.409166152046416,
"adv/ratio_step_to_reasoning": 1.9508568880359554,
"adv/std_final_conf": 0.7747467756271362,
"adv/std_reasoning": 0.6403370499610901,
"adv/std_step_conf": 0.9329255819320679,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7173851789236405,
"calib/avg_num_step_conf": 5.43359375,
"calib/ece": 0.2423320158102767,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.7312252964426877,
"calib/gap": 0.27048393913778535,
"calib/mean_conf": 0.788893280632411,
"calib/mu_c": 0.878698224852071,
"calib/mu_w": 0.6082142857142857,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1816205533596838,
"calib/std_conf": 0.3672539380512327,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5233668903803131,
"calib/step_q_c_n": 894.0,
"calib/step_q_gap": 0.10366870124550431,
"calib/step_q_w": 0.41969818913480883,
"calib/step_q_w_n": 497.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2925.0,
"completions/max_terminated_length": 2925.0,
"completions/mean_length": 493.23046875,
"completions/mean_terminated_length": 495.16473388671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.18346666666666667,
"grad_norm": 0.04100465774536133,
"kl": 0.07511138916015625,
"learning_rate": 7.777777777777779e-07,
"loss": 0.008,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.0354820191860199,
"mask/share_reasoning": 0.8346220254898071,
"mask/share_step_conf": 0.12598973512649536,
"num_tokens": 40508352.0,
"reward": 0.9622762203216553,
"reward_std": 0.1651277393102646,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.7399269342422485,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8549380302429199,
"step": 172
},
{
"adv/mean_abs_final_conf": 0.4922311305999756,
"adv/mean_abs_reasoning": 0.4386613965034485,
"adv/mean_abs_step_conf": 0.7494137287139893,
"adv/ratio_final_to_reasoning": 1.1221209217941883,
"adv/ratio_step_to_reasoning": 1.7084104840032301,
"adv/std_final_conf": 0.7518383264541626,
"adv/std_reasoning": 0.7205957174301147,
"adv/std_step_conf": 0.9345707893371582,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6923623951073815,
"calib/avg_num_step_conf": 5.69140625,
"calib/ece": 0.24757085020242917,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.7854251012145749,
"calib/gap": 0.2964066277912103,
"calib/mean_conf": 0.8197165991902833,
"calib/mu_c": 0.9265189873417721,
"calib/mu_w": 0.6301123595505618,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.21380566801619436,
"calib/std_conf": 0.3551283064132805,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5629658213891952,
"calib/step_q_c_n": 907.0,
"calib/step_q_gap": 0.11854763957101333,
"calib/step_q_w": 0.4444181818181818,
"calib/step_q_w_n": 550.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2519.0,
"completions/max_terminated_length": 2519.0,
"completions/mean_length": 524.23046875,
"completions/mean_terminated_length": 532.5516357421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 91.0,
"epoch": 0.18453333333333333,
"grad_norm": 0.027799900621175766,
"kl": 0.06717681884765625,
"learning_rate": 7.5e-07,
"loss": -0.0282,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.034872934222221375,
"mask/share_reasoning": 0.8314304947853088,
"mask/share_step_conf": 0.11807158589363098,
"num_tokens": 40745715.0,
"reward": 0.9166386127471924,
"reward_std": 0.18919737637043,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.7176336050033569,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8000184297561646,
"step": 173
},
{
"adv/mean_abs_final_conf": 0.7102653980255127,
"adv/mean_abs_reasoning": 0.6087853908538818,
"adv/mean_abs_step_conf": 0.7541342973709106,
"adv/ratio_final_to_reasoning": 1.1666925795136034,
"adv/ratio_step_to_reasoning": 1.2387522905455444,
"adv/std_final_conf": 0.8909445405006409,
"adv/std_reasoning": 0.8267565369606018,
"adv/std_step_conf": 0.9344921708106995,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6303801652892562,
"calib/avg_num_step_conf": 5.8203125,
"calib/ece": 0.3572357723577235,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.556910569105691,
"calib/gap": 0.21883636363636372,
"calib/mean_conf": 0.6324390243902438,
"calib/mu_c": 0.7436363636363637,
"calib/mu_w": 0.5247999999999999,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.24890243902439022,
"calib/std_conf": 0.4382417595094173,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5201589595375722,
"calib/step_q_c_n": 692.0,
"calib/step_q_gap": 0.09376798209396309,
"calib/step_q_w": 0.4263909774436091,
"calib/step_q_w_n": 798.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2789.0,
"completions/max_terminated_length": 2789.0,
"completions/mean_length": 593.65234375,
"completions/mean_terminated_length": 600.6917114257812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.1856,
"grad_norm": 0.04501314088702202,
"kl": 0.059535980224609375,
"learning_rate": 7.222222222222222e-07,
"loss": -0.1192,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.029303016141057014,
"mask/share_reasoning": 0.8484967350959778,
"mask/share_step_conf": 0.11048145592212677,
"num_tokens": 41001922.0,
"reward": 0.8417038917541504,
"reward_std": 0.24447256326675415,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.6223413944244385,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.7743476629257202,
"step": 174
},
{
"adv/mean_abs_final_conf": 0.6777516603469849,
"adv/mean_abs_reasoning": 0.5734829902648926,
"adv/mean_abs_step_conf": 0.7648620009422302,
"adv/ratio_final_to_reasoning": 1.1818164999696512,
"adv/ratio_step_to_reasoning": 1.3337134909423196,
"adv/std_final_conf": 0.867162823677063,
"adv/std_reasoning": 0.7928904891014099,
"adv/std_step_conf": 0.9347115159034729,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7657891281512604,
"calib/avg_num_step_conf": 5.64453125,
"calib/ece": 0.26375000000000004,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.5120967741935484,
"calib/gap": 0.41025210084033614,
"calib/mean_conf": 0.5627016129032258,
"calib/mu_c": 0.7876785714285715,
"calib/mu_w": 0.3774264705882353,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.1874193548387097,
"calib/std_conf": 0.46484601126711506,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.540097244732577,
"calib/step_q_c_n": 617.0,
"calib/step_q_gap": 0.16794748627847078,
"calib/step_q_w": 0.37214975845410625,
"calib/step_q_w_n": 828.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2534.0,
"completions/max_terminated_length": 2534.0,
"completions/mean_length": 535.44921875,
"completions/mean_terminated_length": 546.1155395507812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.18666666666666668,
"grad_norm": 0.0433058999478817,
"kl": 0.068878173828125,
"learning_rate": 6.944444444444446e-07,
"loss": -0.173,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03162894770503044,
"mask/share_reasoning": 0.8328102827072144,
"mask/share_step_conf": 0.11602950841188431,
"num_tokens": 41244821.0,
"reward": 0.8938636183738708,
"reward_std": 0.2151774913072586,
"rewards/accuracy_reward_step": 0.4375,
"rewards/final_brier_reward_step": 0.7044011354446411,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8028572797775269,
"step": 175
},
{
"adv/mean_abs_final_conf": 0.6084058880805969,
"adv/mean_abs_reasoning": 0.5545529127120972,
"adv/mean_abs_step_conf": 0.7464092969894409,
"adv/ratio_final_to_reasoning": 1.0971106167401166,
"adv/ratio_step_to_reasoning": 1.3459658760767312,
"adv/std_final_conf": 0.8169024586677551,
"adv/std_reasoning": 0.7928866744041443,
"adv/std_step_conf": 0.9332429766654968,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7914652162234757,
"calib/avg_num_step_conf": 5.66015625,
"calib/ece": 0.2469758064516129,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6532258064516129,
"calib/gap": 0.42206956755304864,
"calib/mean_conf": 0.6859274193548388,
"calib/mu_c": 0.8595205479452055,
"calib/mu_w": 0.43745098039215685,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.17209677419354838,
"calib/std_conf": 0.43977810888518604,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.553955223880597,
"calib/step_q_c_n": 804.0,
"calib/step_q_gap": 0.18745909984958925,
"calib/step_q_w": 0.3664961240310078,
"calib/step_q_w_n": 645.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3014.0,
"completions/max_terminated_length": 3014.0,
"completions/mean_length": 563.66796875,
"completions/mean_terminated_length": 563.66796875,
"completions/min_length": 155.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.18773333333333334,
"grad_norm": 0.04436732083559036,
"kl": 0.06317901611328125,
"learning_rate": 6.666666666666667e-07,
"loss": 0.0278,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.035566989332437515,
"mask/share_reasoning": 0.8393169045448303,
"mask/share_step_conf": 0.12511610984802246,
"num_tokens": 41493184.0,
"reward": 0.9451636075973511,
"reward_std": 0.21857215464115143,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7356737852096558,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8468407392501831,
"step": 176
},
{
"adv/mean_abs_final_conf": 0.5914150476455688,
"adv/mean_abs_reasoning": 0.37636247277259827,
"adv/mean_abs_step_conf": 0.7463341951370239,
"adv/ratio_final_to_reasoning": 1.5713974969096012,
"adv/ratio_step_to_reasoning": 1.9830196927948394,
"adv/std_final_conf": 0.8054783940315247,
"adv/std_reasoning": 0.6612831354141235,
"adv/std_step_conf": 0.9332849383354187,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7522977346278318,
"calib/avg_num_step_conf": 6.0234375,
"calib/ece": 0.2451778656126482,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6047430830039525,
"calib/gap": 0.40564854368932046,
"calib/mean_conf": 0.6554545454545455,
"calib/mu_c": 0.8206000000000001,
"calib/mu_w": 0.41495145631067964,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15387351778656125,
"calib/std_conf": 0.44099722031906413,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5324347826086957,
"calib/step_q_c_n": 805.0,
"calib/step_q_gap": 0.18633505397911637,
"calib/step_q_w": 0.34609972862957933,
"calib/step_q_w_n": 737.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2976.0,
"completions/max_terminated_length": 2976.0,
"completions/mean_length": 547.703125,
"completions/mean_terminated_length": 549.8510131835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.1888,
"grad_norm": 0.04313787445425987,
"kl": 0.06238555908203125,
"learning_rate": 6.388888888888889e-07,
"loss": -0.002,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.032127976417541504,
"mask/share_reasoning": 0.8453376889228821,
"mask/share_step_conf": 0.11862808465957642,
"num_tokens": 41737228.0,
"reward": 0.9590833783149719,
"reward_std": 0.16259649395942688,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.7471988201141357,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8561241626739502,
"step": 177
},
{
"adv/mean_abs_final_conf": 0.5764954090118408,
"adv/mean_abs_reasoning": 0.5464339852333069,
"adv/mean_abs_step_conf": 0.7638071775436401,
"adv/ratio_final_to_reasoning": 1.0550138252577734,
"adv/ratio_step_to_reasoning": 1.3978032080444687,
"adv/std_final_conf": 0.8018712401390076,
"adv/std_reasoning": 0.7928348779678345,
"adv/std_step_conf": 0.9337645173072815,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.8475838926174497,
"calib/avg_num_step_conf": 5.27734375,
"calib/ece": 0.1528112449799197,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.570281124497992,
"calib/gap": 0.5750194630872483,
"calib/mean_conf": 0.636987951807229,
"calib/mu_c": 0.8679194630872483,
"calib/mu_w": 0.2929,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.09570281124497995,
"calib/std_conf": 0.4432765958372831,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5526944444444445,
"calib/step_q_c_n": 720.0,
"calib/step_q_gap": 0.23023802606092625,
"calib/step_q_w": 0.32245641838351824,
"calib/step_q_w_n": 631.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2948.0,
"completions/max_terminated_length": 2948.0,
"completions/mean_length": 502.83984375,
"completions/mean_terminated_length": 508.8023986816406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.18986666666666666,
"grad_norm": 0.03984364494681358,
"kl": 0.067535400390625,
"learning_rate": 6.111111111111112e-07,
"loss": 0.0287,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03440903127193451,
"mask/share_reasoning": 0.8346933722496033,
"mask/share_step_conf": 0.11917882412672043,
"num_tokens": 41972027.0,
"reward": 0.9897167682647705,
"reward_std": 0.18722940981388092,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.8117296695709229,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8575475811958313,
"step": 178
},
{
"adv/mean_abs_final_conf": 0.5882589817047119,
"adv/mean_abs_reasoning": 0.4754542112350464,
"adv/mean_abs_step_conf": 0.7456690073013306,
"adv/ratio_final_to_reasoning": 1.2372568541913684,
"adv/ratio_step_to_reasoning": 1.5683297984980942,
"adv/std_final_conf": 0.823574960231781,
"adv/std_reasoning": 0.739263117313385,
"adv/std_step_conf": 0.9337601661682129,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.8059141201264488,
"calib/avg_num_step_conf": 5.40625,
"calib/ece": 0.19980000000000003,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.62,
"calib/gap": 0.44298867228661754,
"calib/mean_conf": 0.6828399999999999,
"calib/mu_c": 0.8671232876712329,
"calib/mu_w": 0.4241346153846154,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14932000000000006,
"calib/std_conf": 0.42460939038132445,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.582780678851175,
"calib/step_q_c_n": 766.0,
"calib/step_q_gap": 0.21857355911007464,
"calib/step_q_w": 0.36420711974110037,
"calib/step_q_w_n": 618.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 3039.0,
"completions/max_terminated_length": 3039.0,
"completions/mean_length": 481.26953125,
"completions/mean_terminated_length": 488.90875244140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.19093333333333334,
"grad_norm": 0.05356639623641968,
"kl": 0.06893157958984375,
"learning_rate": 5.833333333333334e-07,
"loss": 0.0005,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03361428156495094,
"mask/share_reasoning": 0.8343751430511475,
"mask/share_step_conf": 0.11638560891151428,
"num_tokens": 42201496.0,
"reward": 0.9610702395439148,
"reward_std": 0.18741470575332642,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7639027237892151,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8488626480102539,
"step": 179
},
{
"adv/mean_abs_final_conf": 0.5343791842460632,
"adv/mean_abs_reasoning": 0.28797054290771484,
"adv/mean_abs_step_conf": 0.7218352556228638,
"adv/ratio_final_to_reasoning": 1.855673079788978,
"adv/ratio_step_to_reasoning": 2.506628797287049,
"adv/std_final_conf": 0.7881537675857544,
"adv/std_reasoning": 0.6184922456741333,
"adv/std_step_conf": 0.9330692887306213,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7854037059335073,
"calib/avg_num_step_conf": 5.875,
"calib/ece": 0.17584000000000005,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.604,
"calib/gap": 0.4683497223894574,
"calib/mean_conf": 0.6805599999999999,
"calib/mu_c": 0.866026490066225,
"calib/mu_w": 0.39767676767676763,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.12620000000000003,
"calib/std_conf": 0.4189650181101043,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5608493771234428,
"calib/step_q_c_n": 883.0,
"calib/step_q_gap": 0.13266902285613208,
"calib/step_q_w": 0.42818035426731077,
"calib/step_q_w_n": 621.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2876.0,
"completions/max_terminated_length": 2876.0,
"completions/mean_length": 596.83984375,
"completions/mean_terminated_length": 606.3135375976562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.192,
"grad_norm": 0.05060546100139618,
"kl": 0.05588531494140625,
"learning_rate": 5.555555555555555e-07,
"loss": -0.0949,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.029479168355464935,
"mask/share_reasoning": 0.8440501093864441,
"mask/share_step_conf": 0.11084578931331635,
"num_tokens": 42458143.0,
"reward": 0.9723742604255676,
"reward_std": 0.15452314913272858,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7846351265907288,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8468320965766907,
"step": 180
},
{
"adv/mean_abs_final_conf": 0.6660337448120117,
"adv/mean_abs_reasoning": 0.5127053260803223,
"adv/mean_abs_step_conf": 0.7276250123977661,
"adv/ratio_final_to_reasoning": 1.2990575890909868,
"adv/ratio_step_to_reasoning": 1.4191875437700714,
"adv/std_final_conf": 0.8656556606292725,
"adv/std_reasoning": 0.7753406763076782,
"adv/std_step_conf": 0.9336156249046326,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7831430532160459,
"calib/avg_num_step_conf": 5.578125,
"calib/ece": 0.24133858267716535,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.562992125984252,
"calib/gap": 0.3968432216607398,
"calib/mean_conf": 0.6356692913385826,
"calib/mu_c": 0.8184671532846715,
"calib/mu_w": 0.42162393162393164,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16881889763779526,
"calib/std_conf": 0.4343962967156663,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5725960264900662,
"calib/step_q_c_n": 755.0,
"calib/step_q_gap": 0.19060494179467252,
"calib/step_q_w": 0.3819910846953937,
"calib/step_q_w_n": 673.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2227.0,
"completions/max_terminated_length": 2227.0,
"completions/mean_length": 494.2265625,
"completions/mean_terminated_length": 496.16473388671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 186.0,
"epoch": 0.19306666666666666,
"grad_norm": 0.035632502287626266,
"kl": 0.07423782348632812,
"learning_rate": 5.277777777777779e-07,
"loss": -0.0462,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.033344317227602005,
"mask/share_reasoning": 0.839685320854187,
"mask/share_step_conf": 0.12306413054466248,
"num_tokens": 42690929.0,
"reward": 0.9542344212532043,
"reward_std": 0.2006225287914276,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.7449023723602295,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8580977320671082,
"step": 181
},
{
"adv/mean_abs_final_conf": 0.5287920236587524,
"adv/mean_abs_reasoning": 0.3742181062698364,
"adv/mean_abs_step_conf": 0.7488981485366821,
"adv/ratio_final_to_reasoning": 1.4130583603495066,
"adv/ratio_step_to_reasoning": 2.001234403117513,
"adv/std_final_conf": 0.7815595865249634,
"adv/std_reasoning": 0.6815541982650757,
"adv/std_step_conf": 0.9339718818664551,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.8582563896694768,
"calib/avg_num_step_conf": 5.765625,
"calib/ece": 0.16573122529644274,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6837944664031621,
"calib/gap": 0.5054824033186134,
"calib/mean_conf": 0.7398023715415021,
"calib/mu_c": 0.9276100628930815,
"calib/mu_w": 0.4221276595744681,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13853754940711469,
"calib/std_conf": 0.39932200504078263,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5976770186335404,
"calib/step_q_c_n": 805.0,
"calib/step_q_gap": 0.2225652451611112,
"calib/step_q_w": 0.3751117734724292,
"calib/step_q_w_n": 671.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2116.0,
"completions/max_terminated_length": 2116.0,
"completions/mean_length": 496.69140625,
"completions/mean_terminated_length": 502.5810546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.19413333333333332,
"grad_norm": 0.03509395942091942,
"kl": 0.0664520263671875,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0732,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03328807279467583,
"mask/share_reasoning": 0.8332520127296448,
"mask/share_step_conf": 0.12174117565155029,
"num_tokens": 42924242.0,
"reward": 1.005211353302002,
"reward_std": 0.1748979389667511,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.8209699392318726,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8675776720046997,
"step": 182
},
{
"adv/mean_abs_final_conf": 0.6386405825614929,
"adv/mean_abs_reasoning": 0.5655806064605713,
"adv/mean_abs_step_conf": 0.761902928352356,
"adv/ratio_final_to_reasoning": 1.129176947134263,
"adv/ratio_step_to_reasoning": 1.3471164315912076,
"adv/std_final_conf": 0.8399648070335388,
"adv/std_reasoning": 0.7929946184158325,
"adv/std_step_conf": 0.9347866773605347,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7205112721417068,
"calib/avg_num_step_conf": 5.140625,
"calib/ece": 0.27548780487804875,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.556910569105691,
"calib/gap": 0.323743961352657,
"calib/mean_conf": 0.6293902439024389,
"calib/mu_c": 0.7715217391304348,
"calib/mu_w": 0.4477777777777778,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.17195121951219508,
"calib/std_conf": 0.438842752624257,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5706896551724139,
"calib/step_q_c_n": 667.0,
"calib/step_q_gap": 0.15645236703682058,
"calib/step_q_w": 0.4142372881355933,
"calib/step_q_w_n": 649.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2662.0,
"completions/max_terminated_length": 2662.0,
"completions/mean_length": 522.3359375,
"completions/mean_terminated_length": 528.5296630859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 200.0,
"epoch": 0.1952,
"grad_norm": 0.035424672067165375,
"kl": 0.06060791015625,
"learning_rate": 4.7222222222222226e-07,
"loss": -0.0903,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.032264262437820435,
"mask/share_reasoning": 0.8487693667411804,
"mask/share_step_conf": 0.10724763572216034,
"num_tokens": 43164640.0,
"reward": 0.9020237326622009,
"reward_std": 0.23391547799110413,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.687953531742096,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.816093921661377,
"step": 183
},
{
"adv/mean_abs_final_conf": 0.5525701642036438,
"adv/mean_abs_reasoning": 0.48427891731262207,
"adv/mean_abs_step_conf": 0.7636120915412903,
"adv/ratio_final_to_reasoning": 1.1410163532824966,
"adv/ratio_step_to_reasoning": 1.5768022605211762,
"adv/std_final_conf": 0.7689260840415955,
"adv/std_reasoning": 0.739284098148346,
"adv/std_step_conf": 0.9334125518798828,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7401926782273603,
"calib/avg_num_step_conf": 5.94140625,
"calib/ece": 0.18125000000000002,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.7096774193548387,
"calib/gap": 0.41945818882466296,
"calib/mean_conf": 0.7540725806451613,
"calib/mu_c": 0.8809248554913297,
"calib/mu_w": 0.46146666666666675,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11887096774193553,
"calib/std_conf": 0.3948794563593181,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5471383975026014,
"calib/step_q_c_n": 961.0,
"calib/step_q_gap": 0.1796383975026014,
"calib/step_q_w": 0.3675,
"calib/step_q_w_n": 560.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2611.0,
"completions/max_terminated_length": 2611.0,
"completions/mean_length": 517.7890625,
"completions/mean_terminated_length": 526.0079956054688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 119.0,
"epoch": 0.19626666666666667,
"grad_norm": 0.032069843262434006,
"kl": 0.0685272216796875,
"learning_rate": 4.444444444444445e-07,
"loss": -0.0207,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03259740397334099,
"mask/share_reasoning": 0.8336936235427856,
"mask/share_step_conf": 0.11808392405509949,
"num_tokens": 43402474.0,
"reward": 0.9748326539993286,
"reward_std": 0.17759272456169128,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/final_brier_reward_step": 0.7816808223724365,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8382970094680786,
"step": 184
},
{
"adv/mean_abs_final_conf": 0.6131725311279297,
"adv/mean_abs_reasoning": 0.48545408248901367,
"adv/mean_abs_step_conf": 0.7590094804763794,
"adv/ratio_final_to_reasoning": 1.2630906881740076,
"adv/ratio_step_to_reasoning": 1.563504166212376,
"adv/std_final_conf": 0.8436364531517029,
"adv/std_reasoning": 0.7574914693832397,
"adv/std_step_conf": 0.9346086978912354,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.8027830755103482,
"calib/avg_num_step_conf": 6.23046875,
"calib/ece": 0.19818181818181813,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.6528925619834711,
"calib/gap": 0.4702097902097901,
"calib/mean_conf": 0.7096694214876034,
"calib/mu_c": 0.9020279720279719,
"calib/mu_w": 0.4318181818181818,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15847107438016522,
"calib/std_conf": 0.42130735896474336,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5844903581267218,
"calib/step_q_c_n": 726.0,
"calib/step_q_gap": 0.26459392544547905,
"calib/step_q_w": 0.31989643268124274,
"calib/step_q_w_n": 869.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2878.0,
"completions/max_terminated_length": 2878.0,
"completions/mean_length": 519.1640625,
"completions/mean_terminated_length": 538.0809936523438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 177.0,
"epoch": 0.19733333333333333,
"grad_norm": 0.043537501245737076,
"kl": 0.068511962890625,
"learning_rate": 4.1666666666666667e-07,
"loss": -0.1086,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.031707763671875,
"mask/share_reasoning": 0.8220318555831909,
"mask/share_step_conf": 0.11110415309667587,
"num_tokens": 43642300.0,
"reward": 0.9259820580482483,
"reward_std": 0.22024211287498474,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7505718469619751,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": 0.8006108999252319,
"step": 185
},
{
"adv/mean_abs_final_conf": 0.623105525970459,
"adv/mean_abs_reasoning": 0.5284816026687622,
"adv/mean_abs_step_conf": 0.771432101726532,
"adv/ratio_final_to_reasoning": 1.1790486609635198,
"adv/ratio_step_to_reasoning": 1.4597142035425679,
"adv/std_final_conf": 0.829008162021637,
"adv/std_reasoning": 0.7576667070388794,
"adv/std_step_conf": 0.9336704015731812,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7929896907216495,
"calib/avg_num_step_conf": 5.52734375,
"calib/ece": 0.19708502024291497,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.6680161943319838,
"calib/gap": 0.4486886597938145,
"calib/mean_conf": 0.7161943319838056,
"calib/mu_c": 0.8924000000000001,
"calib/mu_w": 0.4437113402061856,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.15299595141700403,
"calib/std_conf": 0.4141740850915179,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.6124999999999999,
"calib/step_q_c_n": 748.0,
"calib/step_q_gap": 0.2139542728635681,
"calib/step_q_w": 0.3985457271364318,
"calib/step_q_w_n": 667.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2743.0,
"completions/max_terminated_length": 2743.0,
"completions/mean_length": 504.34765625,
"completions/mean_terminated_length": 512.3532104492188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 128.0,
"epoch": 0.1984,
"grad_norm": 0.035920802503824234,
"kl": 0.06513214111328125,
"learning_rate": 3.8888888888888895e-07,
"loss": -0.1681,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03315415605902672,
"mask/share_reasoning": 0.8398368954658508,
"mask/share_step_conf": 0.11138398945331573,
"num_tokens": 43876453.0,
"reward": 0.9535995721817017,
"reward_std": 0.24708446860313416,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.7642765045166016,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8327664136886597,
"step": 186
},
{
"adv/mean_abs_final_conf": 0.6541886329650879,
"adv/mean_abs_reasoning": 0.559043824672699,
"adv/mean_abs_step_conf": 0.7574201822280884,
"adv/ratio_final_to_reasoning": 1.1701920387871074,
"adv/ratio_step_to_reasoning": 1.354849385325975,
"adv/std_final_conf": 0.8290735483169556,
"adv/std_reasoning": 0.7754475474357605,
"adv/std_step_conf": 0.9339062571525574,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7392602206359506,
"calib/avg_num_step_conf": 6.33984375,
"calib/ece": 0.2710843373493975,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6506024096385542,
"calib/gap": 0.35007852044127186,
"calib/mean_conf": 0.7215261044176707,
"calib/mu_c": 0.8832089552238805,
"calib/mu_w": 0.5331304347826087,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.22722891566265058,
"calib/std_conf": 0.40691127714917147,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5349680715197956,
"calib/step_q_c_n": 783.0,
"calib/step_q_gap": 0.14157283342455756,
"calib/step_q_w": 0.3933952380952381,
"calib/step_q_w_n": 840.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2927.0,
"completions/max_terminated_length": 2927.0,
"completions/mean_length": 536.58984375,
"completions/mean_terminated_length": 545.107177734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 173.0,
"epoch": 0.19946666666666665,
"grad_norm": 0.034603968262672424,
"kl": 0.05792999267578125,
"learning_rate": 3.611111111111111e-07,
"loss": -0.1217,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.030966894701123238,
"mask/share_reasoning": 0.8398405313491821,
"mask/share_step_conf": 0.11356760561466217,
"num_tokens": 44115364.0,
"reward": 0.9120051860809326,
"reward_std": 0.237847238779068,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.7025140523910522,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8230587244033813,
"step": 187
},
{
"adv/mean_abs_final_conf": 0.6000089645385742,
"adv/mean_abs_reasoning": 0.3837595582008362,
"adv/mean_abs_step_conf": 0.7395734190940857,
"adv/ratio_final_to_reasoning": 1.5635023329492328,
"adv/ratio_step_to_reasoning": 1.9271791497816932,
"adv/std_final_conf": 0.8135210275650024,
"adv/std_reasoning": 0.6815720796585083,
"adv/std_step_conf": 0.9343740344047546,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7540782199167293,
"calib/avg_num_step_conf": 5.90234375,
"calib/ece": 0.22043650793650793,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6706349206349206,
"calib/gap": 0.39143334925943635,
"calib/mean_conf": 0.7353571428571428,
"calib/mu_c": 0.8767080745341617,
"calib/mu_w": 0.4852747252747253,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15845238095238096,
"calib/std_conf": 0.40426922045551494,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5919388954171563,
"calib/step_q_c_n": 851.0,
"calib/step_q_gap": 0.18362071359897442,
"calib/step_q_w": 0.40831818181818186,
"calib/step_q_w_n": 660.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2669.0,
"completions/max_terminated_length": 2669.0,
"completions/mean_length": 557.171875,
"completions/mean_terminated_length": 559.3568725585938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.20053333333333334,
"grad_norm": 0.03549211099743843,
"kl": 0.06658172607421875,
"learning_rate": 3.3333333333333335e-07,
"loss": 0.0939,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03242797404527664,
"mask/share_reasoning": 0.8466348648071289,
"mask/share_step_conf": 0.11703091114759445,
"num_tokens": 44362072.0,
"reward": 0.9675827622413635,
"reward_std": 0.17826011776924133,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.7650222778320312,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8474870324134827,
"step": 188
},
{
"adv/mean_abs_final_conf": 0.5870659947395325,
"adv/mean_abs_reasoning": 0.406009316444397,
"adv/mean_abs_step_conf": 0.7587201595306396,
"adv/ratio_final_to_reasoning": 1.4459421766001057,
"adv/ratio_step_to_reasoning": 1.868725984356929,
"adv/std_final_conf": 0.7885566353797913,
"adv/std_reasoning": 0.6816694736480713,
"adv/std_step_conf": 0.9334494471549988,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7710961877628544,
"calib/avg_num_step_conf": 5.33984375,
"calib/ece": 0.25399209486166,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.549407114624506,
"calib/gap": 0.38102428435761776,
"calib/mean_conf": 0.6072727272727272,
"calib/mu_c": 0.744320987654321,
"calib/mu_w": 0.3632967032967032,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11047430830039523,
"calib/std_conf": 0.45210101612569953,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.524469135802469,
"calib/step_q_c_n": 810.0,
"calib/step_q_gap": 0.2071441806857724,
"calib/step_q_w": 0.3173249551166966,
"calib/step_q_w_n": 557.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2740.0,
"completions/max_terminated_length": 2740.0,
"completions/mean_length": 499.375,
"completions/mean_terminated_length": 501.3333740234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.2016,
"grad_norm": 0.03689796105027199,
"kl": 0.07184600830078125,
"learning_rate": 3.055555555555556e-07,
"loss": 0.1004,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03577050566673279,
"mask/share_reasoning": 0.8457111716270447,
"mask/share_step_conf": 0.11461208760738373,
"num_tokens": 44597680.0,
"reward": 0.953514575958252,
"reward_std": 0.16989631950855255,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.731041431427002,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8517688512802124,
"step": 189
},
{
"adv/mean_abs_final_conf": 0.6480185985565186,
"adv/mean_abs_reasoning": 0.45343148708343506,
"adv/mean_abs_step_conf": 0.717402458190918,
"adv/ratio_final_to_reasoning": 1.4291433590655735,
"adv/ratio_step_to_reasoning": 1.5821628594992347,
"adv/std_final_conf": 0.8678444623947144,
"adv/std_reasoning": 0.7392775416374207,
"adv/std_step_conf": 0.9342573881149292,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.8118368700265252,
"calib/avg_num_step_conf": 6.41796875,
"calib/ece": 0.20867469879518075,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5542168674698795,
"calib/gap": 0.4598554376657826,
"calib/mean_conf": 0.6195180722891566,
"calib/mu_c": 0.8115862068965518,
"calib/mu_w": 0.35173076923076924,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.12293172690763056,
"calib/std_conf": 0.4409607171745155,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5320570749108205,
"calib/step_q_c_n": 841.0,
"calib/step_q_gap": 0.14219423201805242,
"calib/step_q_w": 0.3898628428927681,
"calib/step_q_w_n": 802.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2585.0,
"completions/max_terminated_length": 2585.0,
"completions/mean_length": 562.12890625,
"completions/mean_terminated_length": 575.6200561523438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 181.0,
"epoch": 0.20266666666666666,
"grad_norm": 0.03430723026394844,
"kl": 0.05941009521484375,
"learning_rate": 2.7777777777777776e-07,
"loss": -0.1591,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.028694037348031998,
"mask/share_reasoning": 0.8382810354232788,
"mask/share_step_conf": 0.10958744585514069,
"num_tokens": 44847193.0,
"reward": 0.9562456607818604,
"reward_std": 0.2037629783153534,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.7631875276565552,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.841491162776947,
"step": 190
},
{
"adv/mean_abs_final_conf": 0.6388992071151733,
"adv/mean_abs_reasoning": 0.4522554874420166,
"adv/mean_abs_step_conf": 0.7552422285079956,
"adv/ratio_final_to_reasoning": 1.4126953123970358,
"adv/ratio_step_to_reasoning": 1.6699459696546517,
"adv/std_final_conf": 0.8288201689720154,
"adv/std_reasoning": 0.7392417788505554,
"adv/std_step_conf": 0.9342796802520752,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7233987603305785,
"calib/avg_num_step_conf": 6.33203125,
"calib/ece": 0.27963855421686745,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.6626506024096386,
"calib/gap": 0.34462293388429754,
"calib/mean_conf": 0.7169076305220884,
"calib/mu_c": 0.884375,
"calib/mu_w": 0.5397520661157025,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.24124497991967872,
"calib/std_conf": 0.4155091659276454,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5751640513552069,
"calib/step_q_c_n": 701.0,
"calib/step_q_gap": 0.16002274700738078,
"calib/step_q_w": 0.4151413043478261,
"calib/step_q_w_n": 920.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 3058.0,
"completions/max_terminated_length": 3058.0,
"completions/mean_length": 492.35546875,
"completions/mean_terminated_length": 502.1633605957031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.20373333333333332,
"grad_norm": 0.055539507418870926,
"kl": 0.0661163330078125,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.0214,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03583353012800217,
"mask/share_reasoning": 0.8158671855926514,
"mask/share_step_conf": 0.12876802682876587,
"num_tokens": 45077404.0,
"reward": 0.9033346176147461,
"reward_std": 0.18452668190002441,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.6892011761665344,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8229367733001709,
"step": 191
},
{
"adv/mean_abs_final_conf": 0.63950514793396,
"adv/mean_abs_reasoning": 0.5159052610397339,
"adv/mean_abs_step_conf": 0.7172884345054626,
"adv/ratio_final_to_reasoning": 1.2395786517957348,
"adv/ratio_step_to_reasoning": 1.3903491370874363,
"adv/std_final_conf": 0.849496603012085,
"adv/std_reasoning": 0.7755297422409058,
"adv/std_step_conf": 0.9343807697296143,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.8191850006918501,
"calib/avg_num_step_conf": 5.84375,
"calib/ece": 0.18514285714285714,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.6244897959183674,
"calib/gap": 0.5007395876573959,
"calib/mean_conf": 0.6875918367346939,
"calib/mu_c": 0.8899315068493151,
"calib/mu_w": 0.3891919191919192,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1384081632653061,
"calib/std_conf": 0.4220072500763874,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5468974700399468,
"calib/step_q_c_n": 751.0,
"calib/step_q_gap": 0.20658874520773202,
"calib/step_q_w": 0.3403087248322148,
"calib/step_q_w_n": 745.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2460.0,
"completions/max_terminated_length": 2460.0,
"completions/mean_length": 513.36328125,
"completions/mean_terminated_length": 527.795166015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.2048,
"grad_norm": 0.03801960498094559,
"kl": 0.06374359130859375,
"learning_rate": 2.2222222222222224e-07,
"loss": -0.1241,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.034386202692985535,
"mask/share_reasoning": 0.8210378885269165,
"mask/share_step_conf": 0.11723221093416214,
"num_tokens": 45313801.0,
"reward": 0.9600014686584473,
"reward_std": 0.2434854954481125,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7788914442062378,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.8356428146362305,
"step": 192
},
{
"adv/mean_abs_final_conf": 0.6938143968582153,
"adv/mean_abs_reasoning": 0.6675748825073242,
"adv/mean_abs_step_conf": 0.7467559576034546,
"adv/ratio_final_to_reasoning": 1.0393057244040382,
"adv/ratio_step_to_reasoning": 1.118610027385597,
"adv/std_final_conf": 0.8796382546424866,
"adv/std_reasoning": 0.87473464012146,
"adv/std_step_conf": 0.9342511296272278,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7879343629343628,
"calib/avg_num_step_conf": 5.59765625,
"calib/ece": 0.25896414342629476,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6374501992031872,
"calib/gap": 0.3452612612612612,
"calib/mean_conf": 0.7113147410358567,
"calib/mu_c": 0.8640000000000001,
"calib/mu_w": 0.5187387387387389,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2062549800796812,
"calib/std_conf": 0.4062365713892538,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.55,
"calib/step_q_c_n": 716.0,
"calib/step_q_gap": 0.1575732217573222,
"calib/step_q_w": 0.39242677824267785,
"calib/step_q_w_n": 717.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2785.0,
"completions/max_terminated_length": 2785.0,
"completions/mean_length": 517.73828125,
"completions/mean_terminated_length": 521.81494140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 210.0,
"epoch": 0.20586666666666667,
"grad_norm": 0.06238226965069771,
"kl": 0.0617523193359375,
"learning_rate": 1.9444444444444447e-07,
"loss": 0.0348,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.031888388097286224,
"mask/share_reasoning": 0.8456735610961914,
"mask/share_step_conf": 0.11462554335594177,
"num_tokens": 45552054.0,
"reward": 0.9268547296524048,
"reward_std": 0.25273197889328003,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.7135710716247559,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8370133638381958,
"step": 193
},
{
"adv/mean_abs_final_conf": 0.6090821623802185,
"adv/mean_abs_reasoning": 0.41220623254776,
"adv/mean_abs_step_conf": 0.7578562498092651,
"adv/ratio_final_to_reasoning": 1.4776151214784157,
"adv/ratio_step_to_reasoning": 1.8385366109704724,
"adv/std_final_conf": 0.8400105237960815,
"adv/std_reasoning": 0.7012957334518433,
"adv/std_step_conf": 0.9326762557029724,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.8238798238798238,
"calib/avg_num_step_conf": 5.2421875,
"calib/ece": 0.20282868525896425,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6334661354581673,
"calib/gap": 0.4949307174307173,
"calib/mean_conf": 0.6903984063745021,
"calib/mu_c": 0.9033566433566432,
"calib/mu_w": 0.4084259259259259,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.16175298804780885,
"calib/std_conf": 0.43141091678464316,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5734880636604774,
"calib/step_q_c_n": 754.0,
"calib/step_q_gap": 0.21416833576932093,
"calib/step_q_w": 0.3593197278911565,
"calib/step_q_w_n": 588.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2750.0,
"completions/max_terminated_length": 2750.0,
"completions/mean_length": 488.0234375,
"completions/mean_terminated_length": 489.9372863769531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.20693333333333333,
"grad_norm": 0.0440903939306736,
"kl": 0.06768798828125,
"learning_rate": 1.6666666666666668e-07,
"loss": 0.0089,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.035178959369659424,
"mask/share_reasoning": 0.8498561978340149,
"mask/share_step_conf": 0.11105857044458389,
"num_tokens": 45782932.0,
"reward": 0.975521445274353,
"reward_std": 0.20822137594223022,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.7812730073928833,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8611760139465332,
"step": 194
},
{
"adv/mean_abs_final_conf": 0.653519868850708,
"adv/mean_abs_reasoning": 0.554317831993103,
"adv/mean_abs_step_conf": 0.7447904348373413,
"adv/ratio_final_to_reasoning": 1.1789623770552617,
"adv/ratio_step_to_reasoning": 1.3436162285441473,
"adv/std_final_conf": 0.8451783061027527,
"adv/std_reasoning": 0.7929887771606445,
"adv/std_step_conf": 0.9340994358062744,
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.7995283018867925,
"calib/avg_num_step_conf": 5.9609375,
"calib/ece": 0.20425000000000001,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.5666666666666667,
"calib/gap": 0.5018276541819207,
"calib/mean_conf": 0.6285833333333334,
"calib/mu_c": 0.8502238805970149,
"calib/mu_w": 0.3483962264150943,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.13725,
"calib/std_conf": 0.44864851839224384,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5661994219653179,
"calib/step_q_c_n": 692.0,
"calib/step_q_gap": 0.23670301908762015,
"calib/step_q_w": 0.32949640287769777,
"calib/step_q_w_n": 834.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 3007.0,
"completions/max_terminated_length": 3007.0,
"completions/mean_length": 520.796875,
"completions/mean_terminated_length": 531.1713256835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.208,
"grad_norm": 0.037474848330020905,
"kl": 0.0653839111328125,
"learning_rate": 1.3888888888888888e-07,
"loss": -0.0751,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.032066501677036285,
"mask/share_reasoning": 0.8334785103797913,
"mask/share_step_conf": 0.11492373049259186,
"num_tokens": 46022240.0,
"reward": 0.917759895324707,
"reward_std": 0.2372083067893982,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.7431319952011108,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l2_reward": 0.8009814023971558,
"step": 195
},
{
"adv/mean_abs_final_conf": 0.521324634552002,
"adv/mean_abs_reasoning": 0.34145301580429077,
"adv/mean_abs_step_conf": 0.7322190403938293,
"adv/ratio_final_to_reasoning": 1.526782925973064,
"adv/ratio_step_to_reasoning": 2.1444210667435204,
"adv/std_final_conf": 0.7769902348518372,
"adv/std_reasoning": 0.640144407749176,
"adv/std_step_conf": 0.9338326454162598,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.725809470335018,
"calib/avg_num_step_conf": 4.80859375,
"calib/ece": 0.2730314960629922,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.7362204724409449,
"calib/gap": 0.3394616008484622,
"calib/mean_conf": 0.7872834645669291,
"calib/mu_c": 0.9436496350364963,
"calib/mu_w": 0.6041880341880341,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.260472440944882,
"calib/std_conf": 0.3682905049356989,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6186495176848875,
"calib/step_q_c_n": 622.0,
"calib/step_q_gap": 0.11920780996731767,
"calib/step_q_w": 0.49944170771756985,
"calib/step_q_w_n": 609.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1352.0,
"completions/max_terminated_length": 1352.0,
"completions/mean_length": 411.95703125,
"completions/mean_terminated_length": 413.57257080078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.20906666666666668,
"grad_norm": 0.04654751718044281,
"kl": 0.07309722900390625,
"learning_rate": 1.1111111111111112e-07,
"loss": -0.0278,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03892698884010315,
"mask/share_reasoning": 0.8334881067276001,
"mask/share_step_conf": 0.12367869168519974,
"num_tokens": 46230245.0,
"reward": 0.9221498370170593,
"reward_std": 0.17452925443649292,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.7174800634384155,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8213508129119873,
"step": 196
},
{
"adv/mean_abs_final_conf": 0.6606262922286987,
"adv/mean_abs_reasoning": 0.5360690951347351,
"adv/mean_abs_step_conf": 0.7593021392822266,
"adv/ratio_final_to_reasoning": 1.2323528780607238,
"adv/ratio_step_to_reasoning": 1.416425879002378,
"adv/std_final_conf": 0.8487818241119385,
"adv/std_reasoning": 0.7928498983383179,
"adv/std_step_conf": 0.9338219165802002,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7951929365598431,
"calib/avg_num_step_conf": 5.875,
"calib/ece": 0.22253012048192775,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5863453815261044,
"calib/gap": 0.39522890778286446,
"calib/mean_conf": 0.6810843373493976,
"calib/mu_c": 0.85568345323741,
"calib/mu_w": 0.46045454545454556,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.17269076305220887,
"calib/std_conf": 0.40987904587163415,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5547791164658634,
"calib/step_q_c_n": 747.0,
"calib/step_q_gap": 0.14450170563363085,
"calib/step_q_w": 0.41027741083223257,
"calib/step_q_w_n": 757.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2476.0,
"completions/max_terminated_length": 2476.0,
"completions/mean_length": 527.15234375,
"completions/mean_terminated_length": 533.4031982421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.21013333333333334,
"grad_norm": 0.04660060629248619,
"kl": 0.0655059814453125,
"learning_rate": 8.333333333333334e-08,
"loss": 0.0005,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.032303206622600555,
"mask/share_reasoning": 0.8369505405426025,
"mask/share_step_conf": 0.1190275102853775,
"num_tokens": 46470252.0,
"reward": 0.9414454698562622,
"reward_std": 0.20183053612709045,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.7443073987960815,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8354585766792297,
"step": 197
},
{
"adv/mean_abs_final_conf": 0.5518897175788879,
"adv/mean_abs_reasoning": 0.46058160066604614,
"adv/mean_abs_step_conf": 0.7634322047233582,
"adv/ratio_final_to_reasoning": 1.1982452550879177,
"adv/ratio_step_to_reasoning": 1.6575395187722661,
"adv/std_final_conf": 0.7794657945632935,
"adv/std_reasoning": 0.7206063270568848,
"adv/std_step_conf": 0.9341621994972229,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.8199675324675324,
"calib/avg_num_step_conf": 5.7734375,
"calib/ece": 0.17019685039370086,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5354330708661418,
"calib/gap": 0.5053181818181819,
"calib/mean_conf": 0.6128740157480316,
"calib/mu_c": 0.8118181818181819,
"calib/mu_w": 0.3065,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08838582677165359,
"calib/std_conf": 0.440503478801961,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.559171528588098,
"calib/step_q_c_n": 857.0,
"calib/step_q_gap": 0.1234710454963106,
"calib/step_q_w": 0.43570048309178744,
"calib/step_q_w_n": 621.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2915.0,
"completions/max_terminated_length": 2915.0,
"completions/mean_length": 488.0703125,
"completions/mean_terminated_length": 488.0703125,
"completions/min_length": 148.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.2112,
"grad_norm": 0.055203624069690704,
"kl": 0.0720977783203125,
"learning_rate": 5.555555555555556e-08,
"loss": 0.1019,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03624938800930977,
"mask/share_reasoning": 0.8318265676498413,
"mask/share_step_conf": 0.131924107670784,
"num_tokens": 46700582.0,
"reward": 0.9913797378540039,
"reward_std": 0.146388441324234,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.8021363019943237,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8618730902671814,
"step": 198
},
{
"adv/mean_abs_final_conf": 0.6371850967407227,
"adv/mean_abs_reasoning": 0.4301682710647583,
"adv/mean_abs_step_conf": 0.7736830711364746,
"adv/ratio_final_to_reasoning": 1.4812461531938501,
"adv/ratio_step_to_reasoning": 1.7985591294807584,
"adv/std_final_conf": 0.8231662511825562,
"adv/std_reasoning": 0.701428234577179,
"adv/std_step_conf": 0.9334186911582947,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7222439660795825,
"calib/avg_num_step_conf": 5.66796875,
"calib/ece": 0.2613147410358567,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6733067729083665,
"calib/gap": 0.30401630789302025,
"calib/mean_conf": 0.7252191235059761,
"calib/mu_c": 0.8523972602739727,
"calib/mu_w": 0.5483809523809524,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.20243027888446233,
"calib/std_conf": 0.4011063531864865,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5576440460947503,
"calib/step_q_c_n": 781.0,
"calib/step_q_gap": 0.1511962849007204,
"calib/step_q_w": 0.40644776119402987,
"calib/step_q_w_n": 670.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2966.0,
"completions/max_terminated_length": 2966.0,
"completions/mean_length": 543.89453125,
"completions/mean_terminated_length": 548.1771850585938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.21226666666666666,
"grad_norm": 0.036017391830682755,
"kl": 0.0663299560546875,
"learning_rate": 2.777777777777778e-08,
"loss": -0.0326,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.034367334097623825,
"mask/share_reasoning": 0.8351020812988281,
"mask/share_step_conf": 0.12271807342767715,
"num_tokens": 46944019.0,
"reward": 0.9380004405975342,
"reward_std": 0.20820194482803345,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7090073823928833,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8568372130393982,
"step": 199
},
{
"adv/mean_abs_final_conf": 0.5297156572341919,
"adv/mean_abs_reasoning": 0.44187745451927185,
"adv/mean_abs_step_conf": 0.764366626739502,
"adv/ratio_final_to_reasoning": 1.1987840787452737,
"adv/ratio_step_to_reasoning": 1.7298158548755858,
"adv/std_final_conf": 0.7672268748283386,
"adv/std_reasoning": 0.7205674648284912,
"adv/std_step_conf": 0.9337111115455627,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.8338223717168826,
"calib/avg_num_step_conf": 5.44140625,
"calib/ece": 0.19406504065040642,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.6463414634146342,
"calib/gap": 0.496431689687254,
"calib/mean_conf": 0.6830081300813008,
"calib/mu_c": 0.8626114649681529,
"calib/mu_w": 0.3661797752808989,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.119430894308943,
"calib/std_conf": 0.4381992622424606,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5943325242718447,
"calib/step_q_c_n": 824.0,
"calib/step_q_gap": 0.27080001109082535,
"calib/step_q_w": 0.32353251318101933,
"calib/step_q_w_n": 569.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2950.0,
"completions/max_terminated_length": 2950.0,
"completions/mean_length": 542.28515625,
"completions/mean_terminated_length": 548.7154541015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.21333333333333335,
"grad_norm": 0.03856263682246208,
"kl": 0.0735321044921875,
"learning_rate": 0.0,
"loss": -0.0412,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03279662877321243,
"mask/share_reasoning": 0.8446874618530273,
"mask/share_step_conf": 0.11079715937376022,
"num_tokens": 47190892.0,
"reward": 0.9715377688407898,
"reward_std": 0.18524646759033203,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.7729078531265259,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8545427322387695,
"step": 200
},
{
"epoch": 0.21333333333333335,
"step": 200,
"total_flos": 0.0,
"train_loss": -0.02450802539009601,
"train_runtime": 14799.4515,
"train_samples_per_second": 3.46,
"train_steps_per_second": 0.014
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 47190892,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}