Files
PureRL-1.5B-v7-s2-l2-kl-w3-b1/trainer_state.json
ModelHub XC f5bc2eba8c 初始化项目,由ModelHub XC社区提供模型
Model: zhaohq/PureRL-1.5B-v7-s2-l2-kl-w3-b1
Source: Original Platform
2026-06-04 14:44:32 +08:00

12243 lines
502 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21333333333333335,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"adv/mean_abs_final_conf": 0.773959219455719,
"adv/mean_abs_reasoning": 0.47714588046073914,
"adv/mean_abs_step_conf": 0.7502421140670776,
"adv/ratio_final_to_reasoning": 1.622059942565935,
"adv/ratio_step_to_reasoning": 1.5723537492194897,
"adv/std_final_conf": 0.9294352531433105,
"adv/std_reasoning": 0.7393431663513184,
"adv/std_step_conf": 0.9357826709747314,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.38076182006817844,
"calib/avg_num_step_conf": 5.23046875,
"calib/ece": 0.2003187250996017,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.2948207171314741,
"calib/gap": -0.026059730250481805,
"calib/mean_conf": 0.8737051792828686,
"calib/mu_c": 0.865606936416185,
"calib/mu_w": 0.8916666666666668,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19239043824701207,
"calib/std_conf": 0.09027744273295583,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7959393232205367,
"calib/step_q_c_n": 857.0,
"calib/step_q_gap": -0.006446568895645877,
"calib/step_q_w": 0.8023858921161826,
"calib/step_q_w_n": 482.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2492.0,
"completions/max_terminated_length": 2492.0,
"completions/mean_length": 474.94921875,
"completions/mean_terminated_length": 478.68896484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.0010666666666666667,
"grad_norm": 0.04299500212073326,
"kl": 0.000291675329208374,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.0136,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03466901555657387,
"mask/share_reasoning": 0.8340686559677124,
"mask/share_step_conf": 0.12344987690448761,
"num_tokens": 229171.0,
"reward": 1.264374852180481,
"reward_std": 0.26098379492759705,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7142800688743591,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7420004606246948,
"step": 1
},
{
"adv/mean_abs_final_conf": 0.7672724723815918,
"adv/mean_abs_reasoning": 0.5104547739028931,
"adv/mean_abs_step_conf": 0.770934522151947,
"adv/ratio_final_to_reasoning": 1.503115479781084,
"adv/ratio_step_to_reasoning": 1.5102895722914849,
"adv/std_final_conf": 0.9330522418022156,
"adv/std_reasoning": 0.7575037479400635,
"adv/std_step_conf": 0.9358851313591003,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.44343065693430656,
"calib/avg_num_step_conf": 5.05859375,
"calib/ece": 0.3349411764705883,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.2823529411764706,
"calib/gap": 0.002352468143016151,
"calib/mean_conf": 0.8721960784313726,
"calib/mu_c": 0.8732846715328467,
"calib/mu_w": 0.8709322033898306,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3349411764705883,
"calib/std_conf": 0.07627016470309335,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7954391371340525,
"calib/step_q_c_n": 649.0,
"calib/step_q_gap": 0.011011892552009073,
"calib/step_q_w": 0.7844272445820434,
"calib/step_q_w_n": 646.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1966.0,
"completions/max_terminated_length": 1966.0,
"completions/mean_length": 492.9765625,
"completions/mean_terminated_length": 494.9098205566406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.0021333333333333334,
"grad_norm": 0.04044683277606964,
"kl": 0.00037539005279541016,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0158,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03364308178424835,
"mask/share_reasoning": 0.8523939251899719,
"mask/share_step_conf": 0.11005672812461853,
"num_tokens": 458661.0,
"reward": 1.198354721069336,
"reward_std": 0.24474793672561646,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6320762038230896,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.7291916012763977,
"step": 2
},
{
"adv/mean_abs_final_conf": 0.7638648152351379,
"adv/mean_abs_reasoning": 0.4602765142917633,
"adv/mean_abs_step_conf": 0.7489376068115234,
"adv/ratio_final_to_reasoning": 1.659578082993246,
"adv/ratio_step_to_reasoning": 1.6271471247320293,
"adv/std_final_conf": 0.9298396706581116,
"adv/std_reasoning": 0.7392901182174683,
"adv/std_step_conf": 0.9354903697967529,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.37831234083557025,
"calib/avg_num_step_conf": 5.04296875,
"calib/ece": 0.23330708661417332,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.33070866141732286,
"calib/gap": -0.01997935164154463,
"calib/mean_conf": 0.8830708661417324,
"calib/mu_c": 0.8762275449101795,
"calib/mu_w": 0.8962068965517241,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.22944881889763788,
"calib/std_conf": 0.046108598460652704,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7731469440832249,
"calib/step_q_c_n": 769.0,
"calib/step_q_gap": -0.006010144039380494,
"calib/step_q_w": 0.7791570881226054,
"calib/step_q_w_n": 522.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1909.0,
"completions/max_terminated_length": 1909.0,
"completions/mean_length": 499.40625,
"completions/mean_terminated_length": 503.3385925292969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.0032,
"grad_norm": 0.04884733632206917,
"kl": 0.00032722949981689453,
"learning_rate": 7.5e-07,
"loss": -0.0388,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.033127740025520325,
"mask/share_reasoning": 0.8509975671768188,
"mask/share_step_conf": 0.10806218534708023,
"num_tokens": 691765.0,
"reward": 1.2584209442138672,
"reward_std": 0.24406561255455017,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.7049156427383423,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7422913312911987,
"step": 3
},
{
"adv/mean_abs_final_conf": 0.7799098491668701,
"adv/mean_abs_reasoning": 0.49395906925201416,
"adv/mean_abs_step_conf": 0.7630683183670044,
"adv/ratio_final_to_reasoning": 1.5788956974671235,
"adv/ratio_step_to_reasoning": 1.5448007048893615,
"adv/std_final_conf": 0.930604100227356,
"adv/std_reasoning": 0.739273726940155,
"adv/std_step_conf": 0.935773491859436,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5133688061190556,
"calib/avg_num_step_conf": 4.98046875,
"calib/ece": 0.26297619047619053,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.2619047619047619,
"calib/gap": 0.002369803791153924,
"calib/mean_conf": 0.8780555555555557,
"calib/mu_c": 0.8789677419354839,
"calib/mu_w": 0.87659793814433,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.26297619047619053,
"calib/std_conf": 0.04478109109802321,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7936363636363637,
"calib/step_q_c_n": 759.0,
"calib/step_q_gap": 0.012570472163495494,
"calib/step_q_w": 0.7810658914728682,
"calib/step_q_w_n": 516.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2343.0,
"completions/max_terminated_length": 2343.0,
"completions/mean_length": 513.3203125,
"completions/mean_terminated_length": 515.3333740234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.004266666666666667,
"grad_norm": 0.04170985519886017,
"kl": 0.0006768703460693359,
"learning_rate": 1.0000000000000002e-06,
"loss": -0.0059,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03208368271589279,
"mask/share_reasoning": 0.8533977270126343,
"mask/share_step_conf": 0.11061234772205353,
"num_tokens": 929343.0,
"reward": 1.2184948921203613,
"reward_std": 0.2545901834964752,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.6823722124099731,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7179338335990906,
"step": 4
},
{
"adv/mean_abs_final_conf": 0.7979696989059448,
"adv/mean_abs_reasoning": 0.4005493223667145,
"adv/mean_abs_step_conf": 0.7814561128616333,
"adv/ratio_final_to_reasoning": 1.9921883631983293,
"adv/ratio_step_to_reasoning": 1.9509610158476005,
"adv/std_final_conf": 0.9299609065055847,
"adv/std_reasoning": 0.640365481376648,
"adv/std_step_conf": 0.9355432391166687,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5075853350189633,
"calib/avg_num_step_conf": 4.875,
"calib/ece": 0.33939024390243894,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.3089430894308943,
"calib/gap": 0.00040255506021680265,
"calib/mean_conf": 0.8800406504065041,
"calib/mu_c": 0.8802255639097744,
"calib/mu_w": 0.8798230088495576,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.33939024390243894,
"calib/std_conf": 0.0459364882111211,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.7968796433878157,
"calib/step_q_c_n": 673.0,
"calib/step_q_gap": 0.011749208605206896,
"calib/step_q_w": 0.7851304347826088,
"calib/step_q_w_n": 575.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3058.0,
"completions/max_terminated_length": 3058.0,
"completions/mean_length": 531.83984375,
"completions/mean_terminated_length": 531.83984375,
"completions/min_length": 193.0,
"completions/min_terminated_length": 193.0,
"epoch": 0.005333333333333333,
"grad_norm": 0.04463869705796242,
"kl": 0.00034159421920776367,
"learning_rate": 1.25e-06,
"loss": 0.0342,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03385140001773834,
"mask/share_reasoning": 0.8535634279251099,
"mask/share_step_conf": 0.11258512735366821,
"num_tokens": 1172182.0,
"reward": 1.1342566013336182,
"reward_std": 0.20135310292243958,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.6097691059112549,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.6813250780105591,
"step": 5
},
{
"adv/mean_abs_final_conf": 0.7624006271362305,
"adv/mean_abs_reasoning": 0.37178319692611694,
"adv/mean_abs_step_conf": 0.7744120359420776,
"adv/ratio_final_to_reasoning": 2.0506591837385795,
"adv/ratio_step_to_reasoning": 2.0829667460629575,
"adv/std_final_conf": 0.9308306574821472,
"adv/std_reasoning": 0.6402367949485779,
"adv/std_step_conf": 0.9357687830924988,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.45032532532532527,
"calib/avg_num_step_conf": 5.11328125,
"calib/ece": 0.29960937499999996,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.25390625,
"calib/gap": -0.009524524524524569,
"calib/mean_conf": 0.877734375,
"calib/mu_c": 0.8737162162162162,
"calib/mu_w": 0.8832407407407408,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.29960937499999996,
"calib/std_conf": 0.04397646465280463,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7874965986394558,
"calib/step_q_c_n": 735.0,
"calib/step_q_gap": -0.007189812510369986,
"calib/step_q_w": 0.7946864111498257,
"calib/step_q_w_n": 574.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1221.0,
"completions/max_terminated_length": 1221.0,
"completions/mean_length": 428.1796875,
"completions/mean_terminated_length": 429.8588562011719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.0064,
"grad_norm": 0.04509090259671211,
"kl": 0.007303744554519653,
"learning_rate": 1.5e-06,
"loss": -0.0218,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.0379551500082016,
"mask/share_reasoning": 0.8295083045959473,
"mask/share_step_conf": 0.12863025069236755,
"num_tokens": 1387748.0,
"reward": 1.2086174488067627,
"reward_std": 0.20253591239452362,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.6597577929496765,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.7209259867668152,
"step": 6
},
{
"adv/mean_abs_final_conf": 0.7563202977180481,
"adv/mean_abs_reasoning": 0.45961794257164,
"adv/mean_abs_step_conf": 0.733751654624939,
"adv/ratio_final_to_reasoning": 1.6455412804084808,
"adv/ratio_step_to_reasoning": 1.5964382298033766,
"adv/std_final_conf": 0.9309816360473633,
"adv/std_reasoning": 0.7393760681152344,
"adv/std_step_conf": 0.9360485672950745,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.3975895502289254,
"calib/avg_num_step_conf": 5.3359375,
"calib/ece": 0.2612698412698413,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.2619047619047619,
"calib/gap": -0.01635335308376007,
"calib/mean_conf": 0.8776190476190477,
"calib/mu_c": 0.8715189873417721,
"calib/mu_w": 0.8878723404255322,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.255952380952381,
"calib/std_conf": 0.046750410768564156,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7927122641509434,
"calib/step_q_c_n": 848.0,
"calib/step_q_gap": 0.02499025642893571,
"calib/step_q_w": 0.7677220077220077,
"calib/step_q_w_n": 518.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2233.0,
"completions/max_terminated_length": 2233.0,
"completions/mean_length": 516.0234375,
"completions/mean_terminated_length": 520.0866088867188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.007466666666666667,
"grad_norm": 0.04311549291014671,
"kl": 0.0002957582473754883,
"learning_rate": 1.75e-06,
"loss": 0.0227,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.031259916722774506,
"mask/share_reasoning": 0.8517844080924988,
"mask/share_step_conf": 0.10914316773414612,
"num_tokens": 1627274.0,
"reward": 1.2275378704071045,
"reward_std": 0.27290546894073486,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.6782523393630981,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7290366888046265,
"step": 7
},
{
"adv/mean_abs_final_conf": 0.7779233455657959,
"adv/mean_abs_reasoning": 0.45598095655441284,
"adv/mean_abs_step_conf": 0.7926830053329468,
"adv/ratio_final_to_reasoning": 1.7060434967374898,
"adv/ratio_step_to_reasoning": 1.7384125234588712,
"adv/std_final_conf": 0.9291698336601257,
"adv/std_reasoning": 0.720556914806366,
"adv/std_step_conf": 0.9360244870185852,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.4444444444444444,
"calib/avg_num_step_conf": 4.72265625,
"calib/ece": 0.33764,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.272,
"calib/gap": -0.008557165861513893,
"calib/mean_conf": 0.8736399999999999,
"calib/mu_c": 0.8697037037037036,
"calib/mu_w": 0.8782608695652175,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.33564,
"calib/std_conf": 0.05211478101268392,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7928643216080402,
"calib/step_q_c_n": 597.0,
"calib/step_q_gap": 0.03082183794790949,
"calib/step_q_w": 0.7620424836601307,
"calib/step_q_w_n": 612.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2638.0,
"completions/max_terminated_length": 2638.0,
"completions/mean_length": 524.265625,
"completions/mean_terminated_length": 524.265625,
"completions/min_length": 168.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.008533333333333334,
"grad_norm": 0.03729280084371567,
"kl": 0.000550001859664917,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0413,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03387031704187393,
"mask/share_reasoning": 0.8588097095489502,
"mask/share_step_conf": 0.10732000321149826,
"num_tokens": 1867998.0,
"reward": 1.191420555114746,
"reward_std": 0.2365012764930725,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.6184738874435425,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7317929863929749,
"step": 8
},
{
"adv/mean_abs_final_conf": 0.7659152150154114,
"adv/mean_abs_reasoning": 0.37201279401779175,
"adv/mean_abs_step_conf": 0.7619096040725708,
"adv/ratio_final_to_reasoning": 2.058841059586733,
"adv/ratio_step_to_reasoning": 2.0480736585530765,
"adv/std_final_conf": 0.9287598133087158,
"adv/std_reasoning": 0.6404236555099487,
"adv/std_step_conf": 0.9355725646018982,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.4804807889870557,
"calib/avg_num_step_conf": 4.87109375,
"calib/ece": 0.25428,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.288,
"calib/gap": 0.005627696733100596,
"calib/mean_conf": 0.87676,
"calib/mu_c": 0.8788535031847134,
"calib/mu_w": 0.8732258064516129,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.25152,
"calib/std_conf": 0.07093872285289607,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7624759284731774,
"calib/step_q_c_n": 727.0,
"calib/step_q_gap": 0.013918236165485043,
"calib/step_q_w": 0.7485576923076923,
"calib/step_q_w_n": 520.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2793.0,
"completions/max_terminated_length": 2793.0,
"completions/mean_length": 495.79296875,
"completions/mean_terminated_length": 503.6627197265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 181.0,
"epoch": 0.0096,
"grad_norm": 0.06231572851538658,
"kl": 0.000376969575881958,
"learning_rate": 2.25e-06,
"loss": -0.0138,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03376583009958267,
"mask/share_reasoning": 0.8470334410667419,
"mask/share_step_conf": 0.1035757064819336,
"num_tokens": 2102457.0,
"reward": 1.214963674545288,
"reward_std": 0.2494477927684784,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.6849726438522339,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.713883638381958,
"step": 9
},
{
"adv/mean_abs_final_conf": 0.7786163091659546,
"adv/mean_abs_reasoning": 0.5259579420089722,
"adv/mean_abs_step_conf": 0.7487469911575317,
"adv/ratio_final_to_reasoning": 1.4803775111597657,
"adv/ratio_step_to_reasoning": 1.42358719462926,
"adv/std_final_conf": 0.9327322840690613,
"adv/std_reasoning": 0.7754129767417908,
"adv/std_step_conf": 0.9361671805381775,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.4564078282828283,
"calib/avg_num_step_conf": 4.96484375,
"calib/ece": 0.3181102362204725,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.3346456692913386,
"calib/gap": -0.007467171717171706,
"calib/mean_conf": 0.8850393700787402,
"calib/mu_c": 0.8818055555555555,
"calib/mu_w": 0.8892727272727272,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.3181102362204725,
"calib/std_conf": 0.047989073817926636,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.782121661721068,
"calib/step_q_c_n": 674.0,
"calib/step_q_gap": -0.006169795565364122,
"calib/step_q_w": 0.7882914572864321,
"calib/step_q_w_n": 597.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2868.0,
"completions/max_terminated_length": 2868.0,
"completions/mean_length": 515.734375,
"completions/mean_terminated_length": 515.734375,
"completions/min_length": 179.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.010666666666666666,
"grad_norm": 0.04843964800238609,
"kl": 0.00044846534729003906,
"learning_rate": 2.5e-06,
"loss": 0.0816,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.032743364572525024,
"mask/share_reasoning": 0.8583770990371704,
"mask/share_step_conf": 0.10887955129146576,
"num_tokens": 2341285.0,
"reward": 1.1771084070205688,
"reward_std": 0.3006208539009094,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.6394945383071899,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7030642032623291,
"step": 10
},
{
"adv/mean_abs_final_conf": 0.7562990188598633,
"adv/mean_abs_reasoning": 0.4163981080055237,
"adv/mean_abs_step_conf": 0.7739204168319702,
"adv/ratio_final_to_reasoning": 1.8162883171645696,
"adv/ratio_step_to_reasoning": 1.8586069483814847,
"adv/std_final_conf": 0.9316168427467346,
"adv/std_reasoning": 0.7013375163078308,
"adv/std_step_conf": 0.9359068870544434,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5338258680095415,
"calib/avg_num_step_conf": 5.55078125,
"calib/ece": 0.2791269841269842,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.34523809523809523,
"calib/gap": 0.0005565862708720726,
"calib/mean_conf": 0.8788095238095239,
"calib/mu_c": 0.879025974025974,
"calib/mu_w": 0.878469387755102,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.2734126984126985,
"calib/std_conf": 0.07808325982470639,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.7691400233372229,
"calib/step_q_c_n": 857.0,
"calib/step_q_gap": -0.0038918915563941336,
"calib/step_q_w": 0.773031914893617,
"calib/step_q_w_n": 564.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2736.0,
"completions/max_terminated_length": 2736.0,
"completions/mean_length": 530.22265625,
"completions/mean_terminated_length": 532.302001953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.011733333333333333,
"grad_norm": 0.04254964739084244,
"kl": 0.0007021427154541016,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.0465,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03229888156056404,
"mask/share_reasoning": 0.8458642959594727,
"mask/share_step_conf": 0.11793056130409241,
"num_tokens": 2581502.0,
"reward": 1.2219064235687256,
"reward_std": 0.22927621006965637,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6688515543937683,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7296680212020874,
"step": 11
},
{
"adv/mean_abs_final_conf": 0.7611274123191833,
"adv/mean_abs_reasoning": 0.43331027030944824,
"adv/mean_abs_step_conf": 0.7788050174713135,
"adv/ratio_final_to_reasoning": 1.7565413618643857,
"adv/ratio_step_to_reasoning": 1.7973380065862052,
"adv/std_final_conf": 0.9290933012962341,
"adv/std_reasoning": 0.7015314102172852,
"adv/std_step_conf": 0.9357473254203796,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.49364010475121584,
"calib/avg_num_step_conf": 5.51953125,
"calib/ece": 0.21764227642276426,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.3780487804878049,
"calib/gap": -0.0031739618406285697,
"calib/mean_conf": 0.8824390243902438,
"calib/mu_c": 0.8813939393939394,
"calib/mu_w": 0.884567901234568,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.21467479674796752,
"calib/std_conf": 0.05675996332316081,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.7721990740740741,
"calib/step_q_c_n": 864.0,
"calib/step_q_gap": 0.027444975713418307,
"calib/step_q_w": 0.7447540983606558,
"calib/step_q_w_n": 549.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2818.0,
"completions/max_terminated_length": 2818.0,
"completions/mean_length": 501.53515625,
"completions/mean_terminated_length": 509.4960632324219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.0128,
"grad_norm": 0.05267712473869324,
"kl": 0.001438736915588379,
"learning_rate": 3e-06,
"loss": -0.0246,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.035315074026584625,
"mask/share_reasoning": 0.8239625692367554,
"mask/share_step_conf": 0.1250973641872406,
"num_tokens": 2814071.0,
"reward": 1.2636216878890991,
"reward_std": 0.25986015796661377,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.6973953247070312,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.7547677755355835,
"step": 12
},
{
"adv/mean_abs_final_conf": 0.790421187877655,
"adv/mean_abs_reasoning": 0.45352885127067566,
"adv/mean_abs_step_conf": 0.7578126788139343,
"adv/ratio_final_to_reasoning": 1.7428244877102976,
"adv/ratio_step_to_reasoning": 1.6709249625260458,
"adv/std_final_conf": 0.9323068857192993,
"adv/std_reasoning": 0.7014724016189575,
"adv/std_step_conf": 0.9361147880554199,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5766365257259297,
"calib/avg_num_step_conf": 4.88671875,
"calib/ece": 0.2895686274509803,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.3764705882352941,
"calib/gap": 0.01803298522669372,
"calib/mean_conf": 0.8782745098039215,
"calib/mu_c": 0.8856291390728475,
"calib/mu_w": 0.8675961538461537,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.28784313725490185,
"calib/std_conf": 0.06189937388620991,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7543232044198895,
"calib/step_q_c_n": 724.0,
"calib/step_q_gap": 0.010736866658978661,
"calib/step_q_w": 0.7435863377609109,
"calib/step_q_w_n": 527.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1373.0,
"completions/max_terminated_length": 1373.0,
"completions/mean_length": 472.28515625,
"completions/mean_terminated_length": 474.13726806640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 81.0,
"epoch": 0.013866666666666666,
"grad_norm": 0.057831306010484695,
"kl": 0.0020656585693359375,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.003,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03480253368616104,
"mask/share_reasoning": 0.8483515977859497,
"mask/share_step_conf": 0.11293961852788925,
"num_tokens": 3039568.0,
"reward": 1.2582135200500488,
"reward_std": 0.24947398900985718,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6782461404800415,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.7608873844146729,
"step": 13
},
{
"adv/mean_abs_final_conf": 0.7915438413619995,
"adv/mean_abs_reasoning": 0.5042266249656677,
"adv/mean_abs_step_conf": 0.7892253994941711,
"adv/ratio_final_to_reasoning": 1.5698176220184623,
"adv/ratio_step_to_reasoning": 1.5652196064575303,
"adv/std_final_conf": 0.9285059571266174,
"adv/std_reasoning": 0.7393607497215271,
"adv/std_step_conf": 0.9360507130622864,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.42527734778121773,
"calib/avg_num_step_conf": 5.125,
"calib/ece": 0.35727999999999993,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.5,
"calib/gap": -0.010869453044375699,
"calib/mean_conf": 0.90128,
"calib/mu_c": 0.8963235294117646,
"calib/mu_w": 0.9071929824561403,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.35727999999999993,
"calib/std_conf": 0.041616842744254395,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7304722222222223,
"calib/step_q_c_n": 720.0,
"calib/step_q_gap": 0.019728978978979073,
"calib/step_q_w": 0.7107432432432432,
"calib/step_q_w_n": 592.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2478.0,
"completions/max_terminated_length": 2478.0,
"completions/mean_length": 540.88671875,
"completions/mean_terminated_length": 545.1456909179688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.014933333333333333,
"grad_norm": 0.04276634752750397,
"kl": 0.004217386245727539,
"learning_rate": 3.5e-06,
"loss": 0.0095,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03212915360927582,
"mask/share_reasoning": 0.847929060459137,
"mask/share_step_conf": 0.11212927103042603,
"num_tokens": 3283435.0,
"reward": 1.2056703567504883,
"reward_std": 0.2518189251422882,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.6026976108551025,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7535402178764343,
"step": 14
},
{
"adv/mean_abs_final_conf": 0.7444257140159607,
"adv/mean_abs_reasoning": 0.3867869973182678,
"adv/mean_abs_step_conf": 0.752798318862915,
"adv/ratio_final_to_reasoning": 1.9246399676755672,
"adv/ratio_step_to_reasoning": 1.9462865196667267,
"adv/std_final_conf": 0.9274711608886719,
"adv/std_reasoning": 0.6815034747123718,
"adv/std_step_conf": 0.9360182881355286,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.4767543859649123,
"calib/avg_num_step_conf": 4.96484375,
"calib/ece": 0.34996062992125976,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5590551181102362,
"calib/gap": 0.002071428571428724,
"calib/mean_conf": 0.9011417322834645,
"calib/mu_c": 0.9020714285714286,
"calib/mu_w": 0.8999999999999999,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.34996062992125976,
"calib/std_conf": 0.06886506541632316,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6757304964539007,
"calib/step_q_c_n": 705.0,
"calib/step_q_gap": -0.008103425807583498,
"calib/step_q_w": 0.6838339222614842,
"calib/step_q_w_n": 566.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3026.0,
"completions/max_terminated_length": 3026.0,
"completions/mean_length": 480.51171875,
"completions/mean_terminated_length": 480.51171875,
"completions/min_length": 161.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.016,
"grad_norm": 0.04166586697101593,
"kl": 0.007984638214111328,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0245,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03449499234557152,
"mask/share_reasoning": 0.8479899168014526,
"mask/share_step_conf": 0.11751505732536316,
"num_tokens": 3514326.0,
"reward": 1.2254838943481445,
"reward_std": 0.22204411029815674,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6215355396270752,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.7608097791671753,
"step": 15
},
{
"adv/mean_abs_final_conf": 0.7652251720428467,
"adv/mean_abs_reasoning": 0.3866991400718689,
"adv/mean_abs_step_conf": 0.757154107093811,
"adv/ratio_final_to_reasoning": 1.9788644264909094,
"adv/ratio_step_to_reasoning": 1.9579927355232605,
"adv/std_final_conf": 0.9266905188560486,
"adv/std_reasoning": 0.6613523960113525,
"adv/std_step_conf": 0.9358673691749573,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.49171952078928827,
"calib/avg_num_step_conf": 6.12890625,
"calib/ece": 0.25693227091633475,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.649402390438247,
"calib/gap": 0.004776603241719357,
"calib/mean_conf": 0.9143027888446216,
"calib/mu_c": 0.9159393939393939,
"calib/mu_w": 0.9111627906976746,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.25693227091633475,
"calib/std_conf": 0.05200953743549559,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6268780019212296,
"calib/step_q_c_n": 1041.0,
"calib/step_q_gap": 0.04862042616365381,
"calib/step_q_w": 0.5782575757575757,
"calib/step_q_w_n": 528.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2679.0,
"completions/max_terminated_length": 2679.0,
"completions/mean_length": 616.54296875,
"completions/mean_terminated_length": 618.9608154296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 203.0,
"epoch": 0.017066666666666667,
"grad_norm": 0.033337414264678955,
"kl": 0.009404182434082031,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0097,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.02647656947374344,
"mask/share_reasoning": 0.8592157363891602,
"mask/share_step_conf": 0.1104014441370964,
"num_tokens": 3781009.0,
"reward": 1.3121362924575806,
"reward_std": 0.2243286669254303,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.6943659782409668,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8024532794952393,
"step": 16
},
{
"adv/mean_abs_final_conf": 0.756464958190918,
"adv/mean_abs_reasoning": 0.4089164435863495,
"adv/mean_abs_step_conf": 0.7704800963401794,
"adv/ratio_final_to_reasoning": 1.849925504478222,
"adv/ratio_step_to_reasoning": 1.8841993478735706,
"adv/std_final_conf": 0.9228365421295166,
"adv/std_reasoning": 0.6816248893737793,
"adv/std_step_conf": 0.9359196424484253,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5401098901098901,
"calib/avg_num_step_conf": 5.92578125,
"calib/ece": 0.20666666666666672,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.7103174603174603,
"calib/gap": 0.007098901098900989,
"calib/mean_conf": 0.9191269841269841,
"calib/mu_c": 0.921098901098901,
"calib/mu_w": 0.914,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.20178571428571432,
"calib/std_conf": 0.053652613381367925,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6106968325791856,
"calib/step_q_c_n": 1105.0,
"calib/step_q_gap": 0.04331819180248664,
"calib/step_q_w": 0.567378640776699,
"calib/step_q_w_n": 412.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2797.0,
"completions/max_terminated_length": 2797.0,
"completions/mean_length": 538.015625,
"completions/mean_terminated_length": 540.1255493164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.018133333333333335,
"grad_norm": 0.053884051740169525,
"kl": 0.014090538024902344,
"learning_rate": 4.25e-06,
"loss": 0.0679,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03222394734621048,
"mask/share_reasoning": 0.8395468592643738,
"mask/share_step_conf": 0.12432297319173813,
"num_tokens": 4022269.0,
"reward": 1.3697597980499268,
"reward_std": 0.22861257195472717,
"rewards/accuracy_reward_step": 0.7109375,
"rewards/final_brier_reward_step": 0.7480968236923218,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8265707492828369,
"step": 17
},
{
"adv/mean_abs_final_conf": 0.7751752734184265,
"adv/mean_abs_reasoning": 0.3621034622192383,
"adv/mean_abs_step_conf": 0.7781474590301514,
"adv/ratio_final_to_reasoning": 2.140756314970252,
"adv/ratio_step_to_reasoning": 2.1489644265235337,
"adv/std_final_conf": 0.9170754551887512,
"adv/std_reasoning": 0.6403347849845886,
"adv/std_step_conf": 0.9358564615249634,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.4807206866086078,
"calib/avg_num_step_conf": 4.78125,
"calib/ece": 0.38624505928853753,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.7865612648221344,
"calib/gap": -0.0065663258866592855,
"calib/mean_conf": 0.924901185770751,
"calib/mu_c": 0.9219424460431653,
"calib/mu_w": 0.9285087719298246,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.38086956521739124,
"calib/std_conf": 0.07853204903647092,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5981441717791411,
"calib/step_q_c_n": 652.0,
"calib/step_q_gap": -0.0037089750740056937,
"calib/step_q_w": 0.6018531468531468,
"calib/step_q_w_n": 572.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2744.0,
"completions/max_terminated_length": 2744.0,
"completions/mean_length": 521.83203125,
"completions/mean_terminated_length": 521.83203125,
"completions/min_length": 200.0,
"completions/min_terminated_length": 200.0,
"epoch": 0.0192,
"grad_norm": 0.03436724469065666,
"kl": 0.015043258666992188,
"learning_rate": 4.5e-06,
"loss": -0.0453,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.032233573496341705,
"mask/share_reasoning": 0.8630064725875854,
"mask/share_step_conf": 0.10475993156433105,
"num_tokens": 4266578.0,
"reward": 1.2287178039550781,
"reward_std": 0.21607418358325958,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.5942296981811523,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.778868556022644,
"step": 18
},
{
"adv/mean_abs_final_conf": 0.7444548606872559,
"adv/mean_abs_reasoning": 0.42004138231277466,
"adv/mean_abs_step_conf": 0.7566590309143066,
"adv/ratio_final_to_reasoning": 1.7723369459176614,
"adv/ratio_step_to_reasoning": 1.8013916313390212,
"adv/std_final_conf": 0.9174063205718994,
"adv/std_reasoning": 0.7013636231422424,
"adv/std_step_conf": 0.9357441067695618,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.555988515176374,
"calib/avg_num_step_conf": 4.80078125,
"calib/ece": 0.3030677290836654,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.8645418326693227,
"calib/gap": 0.02592904019688269,
"calib/mean_conf": 0.9365338645418326,
"calib/mu_c": 0.9460377358490567,
"calib/mu_w": 0.920108695652174,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3030677290836654,
"calib/std_conf": 0.0807505741632516,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5960899182561308,
"calib/step_q_c_n": 734.0,
"calib/step_q_gap": 0.029726281892494377,
"calib/step_q_w": 0.5663636363636364,
"calib/step_q_w_n": 495.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2435.0,
"completions/max_terminated_length": 2435.0,
"completions/mean_length": 494.65234375,
"completions/mean_terminated_length": 498.5472412109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 206.0,
"epoch": 0.020266666666666665,
"grad_norm": 66.31549835205078,
"kl": 78.52267646789551,
"learning_rate": 4.75e-06,
"loss": 3.7526,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03115307353436947,
"mask/share_reasoning": 0.8544229865074158,
"mask/share_step_conf": 0.10661141574382782,
"num_tokens": 4497969.0,
"reward": 1.2911115884780884,
"reward_std": 0.2555881142616272,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.6604000329971313,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.801536500453949,
"step": 19
},
{
"adv/mean_abs_final_conf": 0.7562682032585144,
"adv/mean_abs_reasoning": 0.3773419260978699,
"adv/mean_abs_step_conf": 0.7754217386245728,
"adv/ratio_final_to_reasoning": 2.004198714622461,
"adv/ratio_step_to_reasoning": 2.054957811455741,
"adv/std_final_conf": 0.9134057760238647,
"adv/std_reasoning": 0.6403542757034302,
"adv/std_step_conf": 0.9357218146324158,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.47950980392156856,
"calib/avg_num_step_conf": 5.45703125,
"calib/ece": 0.3364822134387352,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.8932806324110671,
"calib/gap": -0.0012810457516337914,
"calib/mean_conf": 0.9412252964426877,
"calib/mu_c": 0.940718954248366,
"calib/mu_w": 0.9419999999999998,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3364822134387352,
"calib/std_conf": 0.034965830757530916,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5691792929292929,
"calib/step_q_c_n": 792.0,
"calib/step_q_gap": 0.02997268135904496,
"calib/step_q_w": 0.539206611570248,
"calib/step_q_w_n": 605.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2286.0,
"completions/max_terminated_length": 2286.0,
"completions/mean_length": 472.0,
"completions/mean_terminated_length": 473.85101318359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.021333333333333333,
"grad_norm": 0.05530129000544548,
"kl": 0.04509735107421875,
"learning_rate": 5e-06,
"loss": -0.0349,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.0354749858379364,
"mask/share_reasoning": 0.8304793238639832,
"mask/share_step_conf": 0.13013947010040283,
"num_tokens": 4723673.0,
"reward": 1.2788856029510498,
"reward_std": 0.21537214517593384,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.634465217590332,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.803449809551239,
"step": 20
},
{
"adv/mean_abs_final_conf": 0.7452648878097534,
"adv/mean_abs_reasoning": 0.46478623151779175,
"adv/mean_abs_step_conf": 0.7354752421379089,
"adv/ratio_final_to_reasoning": 1.6034573256958131,
"adv/ratio_step_to_reasoning": 1.5823946413734404,
"adv/std_final_conf": 0.9150353670120239,
"adv/std_reasoning": 0.7205474972724915,
"adv/std_step_conf": 0.9360754489898682,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5659854851031322,
"calib/avg_num_step_conf": 5.6484375,
"calib/ece": 0.34500000000000003,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.9453125,
"calib/gap": 0.024924879042526493,
"calib/mean_conf": 0.9465625,
"calib/mu_c": 0.9564935064935066,
"calib/mu_w": 0.9315686274509801,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.34500000000000003,
"calib/std_conf": 0.08631734236959568,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5779044117647059,
"calib/step_q_c_n": 816.0,
"calib/step_q_gap": 0.03806314192343596,
"calib/step_q_w": 0.5398412698412699,
"calib/step_q_w_n": 630.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1567.0,
"completions/max_terminated_length": 1567.0,
"completions/mean_length": 492.2421875,
"completions/mean_terminated_length": 494.1725769042969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 149.0,
"epoch": 0.0224,
"grad_norm": 0.029058117419481277,
"kl": 0.027462005615234375,
"learning_rate": 4.9722222222222224e-06,
"loss": -0.0362,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.033350586891174316,
"mask/share_reasoning": 0.8396820425987244,
"mask/share_step_conf": 0.12306112051010132,
"num_tokens": 4952647.0,
"reward": 1.3049554824829102,
"reward_std": 0.23162305355072021,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6419129371643066,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8242334127426147,
"step": 21
},
{
"adv/mean_abs_final_conf": 0.7346631288528442,
"adv/mean_abs_reasoning": 0.3285854458808899,
"adv/mean_abs_step_conf": 0.7635392546653748,
"adv/ratio_final_to_reasoning": 2.2358358778895973,
"adv/ratio_step_to_reasoning": 2.3237159899716096,
"adv/std_final_conf": 0.8875892162322998,
"adv/std_reasoning": 0.6184437274932861,
"adv/std_step_conf": 0.9357779622077942,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5129274965800273,
"calib/avg_num_step_conf": 5.9296875,
"calib/ece": 0.2939843750000001,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.95703125,
"calib/gap": 0.0002462380300959355,
"calib/mean_conf": 0.95609375,
"calib/mu_c": 0.9561764705882354,
"calib/mu_w": 0.9559302325581395,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2930078125000001,
"calib/std_conf": 0.026684920103637176,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5590693069306931,
"calib/step_q_c_n": 1010.0,
"calib/step_q_gap": 0.029167732127543533,
"calib/step_q_w": 0.5299015748031496,
"calib/step_q_w_n": 508.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1668.0,
"completions/max_terminated_length": 1668.0,
"completions/mean_length": 474.53515625,
"completions/mean_terminated_length": 476.3961181640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.023466666666666667,
"grad_norm": 0.03352082520723343,
"kl": 0.046070098876953125,
"learning_rate": 4.944444444444445e-06,
"loss": -0.05,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03306357562541962,
"mask/share_reasoning": 0.830856204032898,
"mask/share_step_conf": 0.13217401504516602,
"num_tokens": 5175944.0,
"reward": 1.3375494480133057,
"reward_std": 0.17658598721027374,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.6910320520401001,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.8256270289421082,
"step": 22
},
{
"adv/mean_abs_final_conf": 0.7495639324188232,
"adv/mean_abs_reasoning": 0.44568973779678345,
"adv/mean_abs_step_conf": 0.7528454065322876,
"adv/ratio_final_to_reasoning": 1.6818065772037007,
"adv/ratio_step_to_reasoning": 1.6891692643718774,
"adv/std_final_conf": 0.9074048399925232,
"adv/std_reasoning": 0.7206984162330627,
"adv/std_step_conf": 0.9363330006599426,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5708492975734355,
"calib/avg_num_step_conf": 5.7109375,
"calib/ece": 0.41808764940239046,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9800796812749004,
"calib/gap": 0.014082375478927545,
"calib/mean_conf": 0.9559362549800797,
"calib/mu_c": 0.9624444444444445,
"calib/mu_w": 0.948362068965517,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.41808764940239046,
"calib/std_conf": 0.06459358016031481,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5693057409879839,
"calib/step_q_c_n": 749.0,
"calib/step_q_gap": 0.012652164550396305,
"calib/step_q_w": 0.5566535764375876,
"calib/step_q_w_n": 713.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2977.0,
"completions/max_terminated_length": 2977.0,
"completions/mean_length": 521.40234375,
"completions/mean_terminated_length": 525.5078735351562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.024533333333333334,
"grad_norm": 0.029922185465693474,
"kl": 0.03609466552734375,
"learning_rate": 4.9166666666666665e-06,
"loss": 0.0052,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.033446379005908966,
"mask/share_reasoning": 0.8323598504066467,
"mask/share_step_conf": 0.1263812780380249,
"num_tokens": 5413359.0,
"reward": 1.2099261283874512,
"reward_std": 0.2760199308395386,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.5681461095809937,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7750717997550964,
"step": 23
},
{
"adv/mean_abs_final_conf": 0.7759602665901184,
"adv/mean_abs_reasoning": 0.6207110285758972,
"adv/mean_abs_step_conf": 0.7428168654441833,
"adv/ratio_final_to_reasoning": 1.250115159658772,
"adv/ratio_step_to_reasoning": 1.196719296495238,
"adv/std_final_conf": 0.9179912209510803,
"adv/std_reasoning": 0.826636552810669,
"adv/std_step_conf": 0.936242938041687,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5058531746031745,
"calib/avg_num_step_conf": 6.40625,
"calib/ece": 0.47865853658536595,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.959349593495935,
"calib/gap": 0.0039642857142857535,
"calib/mean_conf": 0.9628861788617886,
"calib/mu_c": 0.9649166666666668,
"calib/mu_w": 0.960952380952381,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.4768699186991871,
"calib/std_conf": 0.04071768822243461,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.555012853470437,
"calib/step_q_c_n": 778.0,
"calib/step_q_gap": 0.04475763305280356,
"calib/step_q_w": 0.5102552204176335,
"calib/step_q_w_n": 862.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2633.0,
"completions/max_terminated_length": 2633.0,
"completions/mean_length": 592.5078125,
"completions/mean_terminated_length": 594.8314208984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 176.0,
"epoch": 0.0256,
"grad_norm": 0.04278721660375595,
"kl": 0.029100418090820312,
"learning_rate": 4.888888888888889e-06,
"loss": -0.048,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.030732743442058563,
"mask/share_reasoning": 0.8376131057739258,
"mask/share_step_conf": 0.12774784862995148,
"num_tokens": 5669553.0,
"reward": 1.1673494577407837,
"reward_std": 0.30620497465133667,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.5042706727981567,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.7722452878952026,
"step": 24
},
{
"adv/mean_abs_final_conf": 0.7407515048980713,
"adv/mean_abs_reasoning": 0.48812973499298096,
"adv/mean_abs_step_conf": 0.7623431086540222,
"adv/ratio_final_to_reasoning": 1.5175299757322158,
"adv/ratio_step_to_reasoning": 1.5617633059477196,
"adv/std_final_conf": 0.9006784558296204,
"adv/std_reasoning": 0.7394152283668518,
"adv/std_step_conf": 0.9362833499908447,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.4982081188505897,
"calib/avg_num_step_conf": 5.96875,
"calib/ece": 0.3764285714285715,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.996031746031746,
"calib/gap": 0.0003766208379486491,
"calib/mean_conf": 0.9676984126984127,
"calib/mu_c": 0.9678523489932884,
"calib/mu_w": 0.9674757281553398,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3764285714285715,
"calib/std_conf": 0.0166744124102322,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5621410891089109,
"calib/step_q_c_n": 808.0,
"calib/step_q_gap": 0.018224422442244204,
"calib/step_q_w": 0.5439166666666667,
"calib/step_q_w_n": 720.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1471.0,
"completions/max_terminated_length": 1471.0,
"completions/mean_length": 489.6875,
"completions/mean_terminated_length": 495.49407958984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.02666666666666667,
"grad_norm": 0.02991301380097866,
"kl": 0.04373931884765625,
"learning_rate": 4.861111111111111e-06,
"loss": -0.0677,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03178451210260391,
"mask/share_reasoning": 0.8335721492767334,
"mask/share_step_conf": 0.12292458117008209,
"num_tokens": 5898137.0,
"reward": 1.237117052078247,
"reward_std": 0.27862468361854553,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.6068245768547058,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7774547338485718,
"step": 25
},
{
"adv/mean_abs_final_conf": 0.6941282749176025,
"adv/mean_abs_reasoning": 0.42233434319496155,
"adv/mean_abs_step_conf": 0.7388712167739868,
"adv/ratio_final_to_reasoning": 1.6435515749595888,
"adv/ratio_step_to_reasoning": 1.7494935675475078,
"adv/std_final_conf": 0.8959683179855347,
"adv/std_reasoning": 0.7012890577316284,
"adv/std_step_conf": 0.9362882375717163,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.4266475252939568,
"calib/avg_num_step_conf": 5.6015625,
"calib/ece": 0.3290438247011955,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9840637450199203,
"calib/gap": 0.0070508613617716875,
"calib/mean_conf": 0.9625099601593626,
"calib/mu_c": 0.9650943396226412,
"calib/mu_w": 0.9580434782608696,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3290438247011955,
"calib/std_conf": 0.06376371737738691,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.609272076372315,
"calib/step_q_c_n": 838.0,
"calib/step_q_gap": 0.07155395556694588,
"calib/step_q_w": 0.5377181208053691,
"calib/step_q_w_n": 596.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2470.0,
"completions/max_terminated_length": 2470.0,
"completions/mean_length": 520.453125,
"completions/mean_terminated_length": 522.494140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 293.0,
"epoch": 0.027733333333333332,
"grad_norm": 109.62390899658203,
"kl": 1576.0367279052734,
"learning_rate": 4.833333333333333e-06,
"loss": 8.7786,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.029633793979883194,
"mask/share_reasoning": 0.8511450290679932,
"mask/share_step_conf": 0.11531488597393036,
"num_tokens": 6136613.0,
"reward": 1.2813361883163452,
"reward_std": 0.25779348611831665,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.6458855271339417,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7978465557098389,
"step": 26
},
{
"adv/mean_abs_final_conf": 0.7589015364646912,
"adv/mean_abs_reasoning": 0.53789222240448,
"adv/mean_abs_step_conf": 0.7639832496643066,
"adv/ratio_final_to_reasoning": 1.4108802932160978,
"adv/ratio_step_to_reasoning": 1.4203277493940274,
"adv/std_final_conf": 0.9002174735069275,
"adv/std_reasoning": 0.7754333019256592,
"adv/std_step_conf": 0.9361553192138672,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.576093023255814,
"calib/avg_num_step_conf": 5.72265625,
"calib/ece": 0.46306299212598434,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.968503937007874,
"calib/gap": -0.0014078759689922782,
"calib/mean_conf": 0.9551889763779527,
"calib/mu_c": 0.9544961240310077,
"calib/mu_w": 0.955904,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4551889763779528,
"calib/std_conf": 0.10139509005828919,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5801657381615598,
"calib/step_q_c_n": 718.0,
"calib/step_q_gap": 0.04477885730479936,
"calib/step_q_w": 0.5353868808567604,
"calib/step_q_w_n": 747.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2123.0,
"completions/max_terminated_length": 2123.0,
"completions/mean_length": 501.37109375,
"completions/mean_terminated_length": 501.37109375,
"completions/min_length": 218.0,
"completions/min_terminated_length": 218.0,
"epoch": 0.0288,
"grad_norm": 0.17514687776565552,
"kl": 0.9313392639160156,
"learning_rate": 4.805555555555556e-06,
"loss": -0.0095,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03224721550941467,
"mask/share_reasoning": 0.8422399163246155,
"mask/share_step_conf": 0.12551286816596985,
"num_tokens": 6370180.0,
"reward": 1.2039215564727783,
"reward_std": 0.2828693687915802,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.5347757935523987,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.7869243621826172,
"step": 27
},
{
"adv/mean_abs_final_conf": 0.7057210206985474,
"adv/mean_abs_reasoning": 0.41700735688209534,
"adv/mean_abs_step_conf": 0.772865891456604,
"adv/ratio_final_to_reasoning": 1.6923466913752385,
"adv/ratio_step_to_reasoning": 1.8533627253850202,
"adv/std_final_conf": 0.8598683476448059,
"adv/std_reasoning": 0.6817318797111511,
"adv/std_step_conf": 0.9360816478729248,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5477935839744519,
"calib/avg_num_step_conf": 5.5,
"calib/ece": 0.3089598393574297,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.9879518072289156,
"calib/gap": -0.0040903614457831194,
"calib/mean_conf": 0.9646224899598392,
"calib/mu_c": 0.9632590361445782,
"calib/mu_w": 0.9673493975903613,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.3034578313253012,
"calib/std_conf": 0.06454818299485003,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5916505894962486,
"calib/step_q_c_n": 933.0,
"calib/step_q_gap": 0.05791964212782752,
"calib/step_q_w": 0.5337309473684211,
"calib/step_q_w_n": 475.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3015.0,
"completions/max_terminated_length": 3015.0,
"completions/mean_length": 555.08984375,
"completions/mean_terminated_length": 555.08984375,
"completions/min_length": 194.0,
"completions/min_terminated_length": 194.0,
"epoch": 0.029866666666666666,
"grad_norm": 0.02628139778971672,
"kl": 0.032978057861328125,
"learning_rate": 4.777777777777778e-06,
"loss": 0.0195,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.030390523374080658,
"mask/share_reasoning": 0.8543544411659241,
"mask/share_step_conf": 0.11525503545999527,
"num_tokens": 6619227.0,
"reward": 1.279329538345337,
"reward_std": 0.25941693782806396,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.6638847589492798,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.7860590219497681,
"step": 28
},
{
"adv/mean_abs_final_conf": 0.7725566625595093,
"adv/mean_abs_reasoning": 0.5233668088912964,
"adv/mean_abs_step_conf": 0.7560856342315674,
"adv/ratio_final_to_reasoning": 1.4761285000019362,
"adv/ratio_step_to_reasoning": 1.4446572105580482,
"adv/std_final_conf": 0.896633505821228,
"adv/std_reasoning": 0.7575854063034058,
"adv/std_step_conf": 0.9362297058105469,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.46384673272293997,
"calib/avg_num_step_conf": 6.22265625,
"calib/ece": 0.49857707509881427,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9960474308300395,
"calib/gap": -0.0002063213345036008,
"calib/mean_conf": 0.9689328063241106,
"calib/mu_c": 0.9688235294117648,
"calib/mu_w": 0.9690298507462684,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.49857707509881427,
"calib/std_conf": 0.018249680023997687,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5732078907435508,
"calib/step_q_c_n": 659.0,
"calib/step_q_gap": 0.051212173398797134,
"calib/step_q_w": 0.5219957173447537,
"calib/step_q_w_n": 934.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2474.0,
"completions/max_terminated_length": 2474.0,
"completions/mean_length": 594.38671875,
"completions/mean_terminated_length": 596.7177124023438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 197.0,
"epoch": 0.030933333333333334,
"grad_norm": 0.029749587178230286,
"kl": 0.037349700927734375,
"learning_rate": 4.75e-06,
"loss": -0.0868,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.02785518765449524,
"mask/share_reasoning": 0.8521197438240051,
"mask/share_step_conf": 0.11611880362033844,
"num_tokens": 6878518.0,
"reward": 1.176987648010254,
"reward_std": 0.27402737736701965,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/final_brier_reward_step": 0.49552401900291443,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7846944332122803,
"step": 29
},
{
"adv/mean_abs_final_conf": 0.7688121795654297,
"adv/mean_abs_reasoning": 0.6039595603942871,
"adv/mean_abs_step_conf": 0.7581928968429565,
"adv/ratio_final_to_reasoning": 1.2729530749766105,
"adv/ratio_step_to_reasoning": 1.2553703038461386,
"adv/std_final_conf": 0.9099051356315613,
"adv/std_reasoning": 0.8266823887825012,
"adv/std_step_conf": 0.936536967754364,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.49207281268349967,
"calib/avg_num_step_conf": 6.078125,
"calib/ece": 0.4412379032258064,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.9798387096774194,
"calib/gap": 0.002922881190056792,
"calib/mean_conf": 0.9589798387096776,
"calib/mu_c": 0.9603587786259542,
"calib/mu_w": 0.9574358974358974,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.4359959677419355,
"calib/std_conf": 0.094493789247895,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5554888888888889,
"calib/step_q_c_n": 765.0,
"calib/step_q_gap": 0.031199381935665182,
"calib/step_q_w": 0.5242895069532237,
"calib/step_q_w_n": 791.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2985.0,
"completions/max_terminated_length": 2985.0,
"completions/mean_length": 591.859375,
"completions/mean_terminated_length": 598.8775024414062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.032,
"grad_norm": 0.026807932183146477,
"kl": 0.03380012512207031,
"learning_rate": 4.722222222222222e-06,
"loss": -0.0778,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.027108782902359962,
"mask/share_reasoning": 0.8485285043716431,
"mask/share_step_conf": 0.11264392733573914,
"num_tokens": 7137018.0,
"reward": 1.1777077913284302,
"reward_std": 0.3551919758319855,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.5364526510238647,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.7618252038955688,
"step": 30
},
{
"adv/mean_abs_final_conf": 0.7549532651901245,
"adv/mean_abs_reasoning": 0.38709408044815063,
"adv/mean_abs_step_conf": 0.7758951187133789,
"adv/ratio_final_to_reasoning": 1.9503095069707397,
"adv/ratio_step_to_reasoning": 2.0044096717136606,
"adv/std_final_conf": 0.8967974781990051,
"adv/std_reasoning": 0.6613409519195557,
"adv/std_step_conf": 0.9362436532974243,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.49223726114649685,
"calib/avg_num_step_conf": 6.50390625,
"calib/ece": 0.5870316205533596,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9920948616600791,
"calib/gap": 0.0065144639065818355,
"calib/mean_conf": 0.9664782608695652,
"calib/mu_c": 0.9705208333333334,
"calib/mu_w": 0.9640063694267516,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.5870316205533596,
"calib/std_conf": 0.06328001421734168,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5654389799635701,
"calib/step_q_c_n": 549.0,
"calib/step_q_gap": 0.07273351401374922,
"calib/step_q_w": 0.49270546594982084,
"calib/step_q_w_n": 1116.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2672.0,
"completions/max_terminated_length": 2672.0,
"completions/mean_length": 612.84375,
"completions/mean_terminated_length": 612.84375,
"completions/min_length": 189.0,
"completions/min_terminated_length": 189.0,
"epoch": 0.03306666666666667,
"grad_norm": 0.03330698609352112,
"kl": 0.03545379638671875,
"learning_rate": 4.694444444444445e-06,
"loss": -0.0565,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.02880018949508667,
"mask/share_reasoning": 0.8523030281066895,
"mask/share_step_conf": 0.11889677494764328,
"num_tokens": 7399818.0,
"reward": 1.1024078130722046,
"reward_std": 0.2412220537662506,
"rewards/accuracy_reward_step": 0.375,
"rewards/final_brier_reward_step": 0.4129304587841034,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7600051164627075,
"step": 31
},
{
"adv/mean_abs_final_conf": 0.7467050552368164,
"adv/mean_abs_reasoning": 0.4965146780014038,
"adv/mean_abs_step_conf": 0.7660564184188843,
"adv/ratio_final_to_reasoning": 1.503893214682981,
"adv/ratio_step_to_reasoning": 1.5428676177358012,
"adv/std_final_conf": 0.9030807614326477,
"adv/std_reasoning": 0.7575876116752625,
"adv/std_step_conf": 0.9363879561424255,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5172176308539945,
"calib/avg_num_step_conf": 5.57421875,
"calib/ece": 0.4602874493927124,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.9959514170040485,
"calib/gap": 0.0008768201495474859,
"calib/mean_conf": 0.9689514170040485,
"calib/mu_c": 0.9693809523809525,
"calib/mu_w": 0.968504132231405,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4595587044534411,
"calib/std_conf": 0.01734076391436663,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5947622047244094,
"calib/step_q_c_n": 635.0,
"calib/step_q_gap": 0.07739248250218733,
"calib/step_q_w": 0.5173697222222221,
"calib/step_q_w_n": 792.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2519.0,
"completions/max_terminated_length": 2519.0,
"completions/mean_length": 565.86328125,
"completions/mean_terminated_length": 572.5731201171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 223.0,
"epoch": 0.034133333333333335,
"grad_norm": 0.027324911206960678,
"kl": 0.04216766357421875,
"learning_rate": 4.666666666666667e-06,
"loss": -0.0062,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.028847582638263702,
"mask/share_reasoning": 0.8494766354560852,
"mask/share_step_conf": 0.10995703935623169,
"num_tokens": 7651383.0,
"reward": 1.1622298955917358,
"reward_std": 0.29328110814094543,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.5207406282424927,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.7557656764984131,
"step": 32
},
{
"adv/mean_abs_final_conf": 0.7327004671096802,
"adv/mean_abs_reasoning": 0.4044535458087921,
"adv/mean_abs_step_conf": 0.7745844125747681,
"adv/ratio_final_to_reasoning": 1.8115812673727152,
"adv/ratio_step_to_reasoning": 1.9151381428139527,
"adv/std_final_conf": 0.8804807066917419,
"adv/std_reasoning": 0.6816219687461853,
"adv/std_step_conf": 0.9363637566566467,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.4688944530046225,
"calib/avg_num_step_conf": 5.75,
"calib/ece": 0.49979999999999997,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.988,
"calib/gap": -0.009604519774011222,
"calib/mean_conf": 0.9638,
"calib/mu_c": 0.958728813559322,
"calib/mu_w": 0.9683333333333333,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.49579999999999996,
"calib/std_conf": 0.06370211927400844,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5878157894736843,
"calib/step_q_c_n": 684.0,
"calib/step_q_gap": 0.03392619556505483,
"calib/step_q_w": 0.5538895939086295,
"calib/step_q_w_n": 788.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2509.0,
"completions/max_terminated_length": 2509.0,
"completions/mean_length": 585.109375,
"completions/mean_terminated_length": 585.109375,
"completions/min_length": 153.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.0352,
"grad_norm": 0.027998236939311028,
"kl": 0.0415496826171875,
"learning_rate": 4.638888888888889e-06,
"loss": 0.022,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.028615616261959076,
"mask/share_reasoning": 0.8616238236427307,
"mask/share_step_conf": 0.10976054519414902,
"num_tokens": 7908043.0,
"reward": 1.1462751626968384,
"reward_std": 0.24753305315971375,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.48835116624832153,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7583495378494263,
"step": 33
},
{
"adv/mean_abs_final_conf": 0.7453949451446533,
"adv/mean_abs_reasoning": 0.6234875917434692,
"adv/mean_abs_step_conf": 0.7739483714103699,
"adv/ratio_final_to_reasoning": 1.1955249070158596,
"adv/ratio_step_to_reasoning": 1.2413212093702852,
"adv/std_final_conf": 0.9072105884552002,
"adv/std_reasoning": 0.8429659008979797,
"adv/std_step_conf": 0.936280369758606,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.4666860916860917,
"calib/avg_num_step_conf": 5.90625,
"calib/ece": 0.39868514342629496,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9920318725099602,
"calib/gap": 0.006329069347319027,
"calib/mean_conf": 0.9672907211155379,
"calib/mu_c": 0.9700139860139857,
"calib/mu_w": 0.9636849166666667,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.39812737450199215,
"calib/std_conf": 0.06283266440044762,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5647769869513641,
"calib/step_q_c_n": 843.0,
"calib/step_q_gap": 0.02021490473910703,
"calib/step_q_w": 0.5445620822122571,
"calib/step_q_w_n": 669.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2668.0,
"completions/max_terminated_length": 2668.0,
"completions/mean_length": 498.63671875,
"completions/mean_terminated_length": 502.56298828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.03626666666666667,
"grad_norm": 0.02821074239909649,
"kl": 0.0689544677734375,
"learning_rate": 4.611111111111112e-06,
"loss": 0.0305,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03244972601532936,
"mask/share_reasoning": 0.8274343013763428,
"mask/share_step_conf": 0.13230347633361816,
"num_tokens": 8140806.0,
"reward": 1.2510292530059814,
"reward_std": 0.3187180757522583,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.58431476354599,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8049654960632324,
"step": 34
},
{
"adv/mean_abs_final_conf": 0.699095606803894,
"adv/mean_abs_reasoning": 0.41293585300445557,
"adv/mean_abs_step_conf": 0.7703684568405151,
"adv/ratio_final_to_reasoning": 1.6929883944864212,
"adv/ratio_step_to_reasoning": 1.8655886894669882,
"adv/std_final_conf": 0.8799118399620056,
"adv/std_reasoning": 0.7013152837753296,
"adv/std_step_conf": 0.9362316131591797,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.49717675421900775,
"calib/avg_num_step_conf": 5.109375,
"calib/ece": 0.4048181818181818,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9960474308300395,
"calib/gap": 0.008815569090216968,
"calib/mean_conf": 0.9660830039525691,
"calib/mu_c": 0.9699507042253519,
"calib/mu_w": 0.9611351351351349,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.4048181818181818,
"calib/std_conf": 0.062200976598366195,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5695566433566432,
"calib/step_q_c_n": 715.0,
"calib/step_q_gap": 0.029053551732135086,
"calib/step_q_w": 0.5405030916245082,
"calib/step_q_w_n": 593.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2899.0,
"completions/max_terminated_length": 2899.0,
"completions/mean_length": 564.66015625,
"completions/mean_terminated_length": 566.8745727539062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 201.0,
"epoch": 0.037333333333333336,
"grad_norm": 0.02963513694703579,
"kl": 0.0429534912109375,
"learning_rate": 4.583333333333333e-06,
"loss": 0.0041,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.02836042456328869,
"mask/share_reasoning": 0.8690199851989746,
"mask/share_step_conf": 0.09871330857276917,
"num_tokens": 8394615.0,
"reward": 1.2391767501831055,
"reward_std": 0.2501593828201294,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.5834301710128784,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.793164849281311,
"step": 35
},
{
"adv/mean_abs_final_conf": 0.6682970523834229,
"adv/mean_abs_reasoning": 0.3160165548324585,
"adv/mean_abs_step_conf": 0.7482949495315552,
"adv/ratio_final_to_reasoning": 2.1147533006229113,
"adv/ratio_step_to_reasoning": 2.3678979410691836,
"adv/std_final_conf": 0.8350497484207153,
"adv/std_reasoning": 0.6184996962547302,
"adv/std_step_conf": 0.9360457062721252,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5037674037674038,
"calib/avg_num_step_conf": 5.6328125,
"calib/ece": 0.23240239043824695,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0005545454545455852,
"calib/mean_conf": 0.9694541832669322,
"calib/mu_c": 0.9696,
"calib/mu_w": 0.9690454545454544,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.23240239043824695,
"calib/std_conf": 0.012041135190817554,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5679854555125725,
"calib/step_q_c_n": 1034.0,
"calib/step_q_gap": -0.0008944464482117942,
"calib/step_q_w": 0.5688799019607843,
"calib/step_q_w_n": 408.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2349.0,
"completions/max_terminated_length": 2349.0,
"completions/mean_length": 509.48828125,
"completions/mean_terminated_length": 511.4862976074219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 188.0,
"epoch": 0.0384,
"grad_norm": 0.035114437341690063,
"kl": 0.054393768310546875,
"learning_rate": 4.555555555555556e-06,
"loss": -0.0151,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.033349648118019104,
"mask/share_reasoning": 0.8350939750671387,
"mask/share_step_conf": 0.12765014171600342,
"num_tokens": 8627756.0,
"reward": 1.355540156364441,
"reward_std": 0.21335530281066895,
"rewards/accuracy_reward_step": 0.72265625,
"rewards/final_brier_reward_step": 0.7375601530075073,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8164474964141846,
"step": 36
},
{
"adv/mean_abs_final_conf": 0.6995407938957214,
"adv/mean_abs_reasoning": 0.44902467727661133,
"adv/mean_abs_step_conf": 0.7672286629676819,
"adv/ratio_final_to_reasoning": 1.5579116901514645,
"adv/ratio_step_to_reasoning": 1.7086558975355564,
"adv/std_final_conf": 0.8746408224105835,
"adv/std_reasoning": 0.7393237352371216,
"adv/std_step_conf": 0.9361535310745239,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5347666666666667,
"calib/avg_num_step_conf": 5.80859375,
"calib/ece": 0.4689551020408164,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.9755102040816327,
"calib/gap": 0.02292133333333346,
"calib/mean_conf": 0.9561387755102042,
"calib/mu_c": 0.9678333333333334,
"calib/mu_w": 0.944912,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.4676489795918368,
"calib/std_conf": 0.1099609073694876,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5881790808240887,
"calib/step_q_c_n": 631.0,
"calib/step_q_gap": 0.11911482848764005,
"calib/step_q_w": 0.4690642523364486,
"calib/step_q_w_n": 856.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2918.0,
"completions/max_terminated_length": 2918.0,
"completions/mean_length": 540.04296875,
"completions/mean_terminated_length": 548.6151123046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 200.0,
"epoch": 0.039466666666666664,
"grad_norm": 0.03515337035059929,
"kl": 0.049747467041015625,
"learning_rate": 4.527777777777778e-06,
"loss": -0.0503,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.030141158029437065,
"mask/share_reasoning": 0.837505578994751,
"mask/share_step_conf": 0.11672825366258621,
"num_tokens": 8873103.0,
"reward": 1.1600000858306885,
"reward_std": 0.2818068265914917,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.5091338753700256,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.762855052947998,
"step": 37
},
{
"adv/mean_abs_final_conf": 0.7247108221054077,
"adv/mean_abs_reasoning": 0.5161527991294861,
"adv/mean_abs_step_conf": 0.7817589044570923,
"adv/ratio_final_to_reasoning": 1.4040625631163168,
"adv/ratio_step_to_reasoning": 1.5145881331566202,
"adv/std_final_conf": 0.8897721767425537,
"adv/std_reasoning": 0.7754086852073669,
"adv/std_step_conf": 0.9362268447875977,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5271099744245524,
"calib/avg_num_step_conf": 5.23046875,
"calib/ece": 0.4236932270916334,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9960159362549801,
"calib/gap": 0.010359015345268507,
"calib/mean_conf": 0.9655258964143426,
"calib/mu_c": 0.9702720588235294,
"calib/mu_w": 0.9599130434782609,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4236932270916334,
"calib/std_conf": 0.06272537315110564,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.58784,
"calib/step_q_c_n": 750.0,
"calib/step_q_gap": 0.06180626324786331,
"calib/step_q_w": 0.5260337367521367,
"calib/step_q_w_n": 585.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2594.0,
"completions/max_terminated_length": 2594.0,
"completions/mean_length": 538.31640625,
"completions/mean_terminated_length": 538.31640625,
"completions/min_length": 167.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.04053333333333333,
"grad_norm": 0.028706612065434456,
"kl": 0.106964111328125,
"learning_rate": 4.5e-06,
"loss": -0.0183,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.030747022479772568,
"mask/share_reasoning": 0.8551157116889954,
"mask/share_step_conf": 0.11413724720478058,
"num_tokens": 9117800.0,
"reward": 1.2067832946777344,
"reward_std": 0.3105279803276062,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.5579559803009033,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.7774146795272827,
"step": 38
},
{
"adv/mean_abs_final_conf": 0.7692740559577942,
"adv/mean_abs_reasoning": 0.41159647703170776,
"adv/mean_abs_step_conf": 0.7705304622650146,
"adv/ratio_final_to_reasoning": 1.8690005840320454,
"adv/ratio_step_to_reasoning": 1.8720531036169583,
"adv/std_final_conf": 0.868905246257782,
"adv/std_reasoning": 0.6404502391815186,
"adv/std_step_conf": 0.9361722469329834,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5618107769423559,
"calib/avg_num_step_conf": 5.52734375,
"calib/ece": 0.4203137795275591,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.9960629921259843,
"calib/gap": 0.004658521303257945,
"calib/mean_conf": 0.9714948818897639,
"calib/mu_c": 0.973585714285714,
"calib/mu_w": 0.9689271929824561,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4203137795275591,
"calib/std_conf": 0.015466978203573886,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5696733285094067,
"calib/step_q_c_n": 691.0,
"calib/step_q_gap": 0.01542487547073268,
"calib/step_q_w": 0.554248453038674,
"calib/step_q_w_n": 724.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2743.0,
"completions/max_terminated_length": 2743.0,
"completions/mean_length": 531.41796875,
"completions/mean_terminated_length": 531.41796875,
"completions/min_length": 174.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.0416,
"grad_norm": 0.034416694194078445,
"kl": 0.0525054931640625,
"learning_rate": 4.472222222222223e-06,
"loss": -0.0403,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.031083395704627037,
"mask/share_reasoning": 0.8509421348571777,
"mask/share_step_conf": 0.11797446012496948,
"num_tokens": 9359931.0,
"reward": 1.2292721271514893,
"reward_std": 0.2425519824028015,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.5735056400299072,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.7886128425598145,
"step": 39
},
{
"adv/mean_abs_final_conf": 0.7616095542907715,
"adv/mean_abs_reasoning": 0.533380389213562,
"adv/mean_abs_step_conf": 0.7608983516693115,
"adv/ratio_final_to_reasoning": 1.427891931710725,
"adv/ratio_step_to_reasoning": 1.4265585444399471,
"adv/std_final_conf": 0.8854411840438843,
"adv/std_reasoning": 0.7577327489852905,
"adv/std_step_conf": 0.9363117218017578,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.49098298196596385,
"calib/avg_num_step_conf": 5.08203125,
"calib/ece": 0.4785019920318725,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0010485775971551536,
"calib/mean_conf": 0.9725258964143426,
"calib/mu_c": 0.9730564516129031,
"calib/mu_w": 0.972007874015748,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4785019920318725,
"calib/std_conf": 0.014743299061754455,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5764660347551344,
"calib/step_q_c_n": 633.0,
"calib/step_q_gap": 0.024854657509625344,
"calib/step_q_w": 0.551611377245509,
"calib/step_q_w_n": 668.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2952.0,
"completions/max_terminated_length": 2952.0,
"completions/mean_length": 598.98828125,
"completions/mean_terminated_length": 598.98828125,
"completions/min_length": 190.0,
"completions/min_terminated_length": 190.0,
"epoch": 0.042666666666666665,
"grad_norm": 0.02472682110965252,
"kl": 0.052127838134765625,
"learning_rate": 4.444444444444444e-06,
"loss": -0.0055,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.029470203444361687,
"mask/share_reasoning": 0.8670581579208374,
"mask/share_step_conf": 0.10347166657447815,
"num_tokens": 9620032.0,
"reward": 1.1567916870117188,
"reward_std": 0.3067583441734314,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.5111952424049377,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7543189525604248,
"step": 40
},
{
"adv/mean_abs_final_conf": 0.6801141500473022,
"adv/mean_abs_reasoning": 0.46634307503700256,
"adv/mean_abs_step_conf": 0.7382851839065552,
"adv/ratio_final_to_reasoning": 1.4583987335790025,
"adv/ratio_step_to_reasoning": 1.583137444140186,
"adv/std_final_conf": 0.8588567972183228,
"adv/std_reasoning": 0.7393189668655396,
"adv/std_step_conf": 0.9362447261810303,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.526853428261879,
"calib/avg_num_step_conf": 5.00390625,
"calib/ece": 0.2572332015810277,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9960474308300395,
"calib/gap": -0.001999071351183934,
"calib/mean_conf": 0.9718577075098815,
"calib/mu_c": 0.9712967032967031,
"calib/mu_w": 0.9732957746478871,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2548616600790514,
"calib/std_conf": 0.03903909449631632,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5804437837837838,
"calib/step_q_c_n": 925.0,
"calib/step_q_gap": -0.0006741937443061063,
"calib/step_q_w": 0.5811179775280899,
"calib/step_q_w_n": 356.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2352.0,
"completions/max_terminated_length": 2352.0,
"completions/mean_length": 476.0390625,
"completions/mean_terminated_length": 477.9059143066406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.04373333333333333,
"grad_norm": 0.03796948492527008,
"kl": 0.066650390625,
"learning_rate": 4.416666666666667e-06,
"loss": -0.0581,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.034183815121650696,
"mask/share_reasoning": 0.8409860730171204,
"mask/share_step_conf": 0.12092389166355133,
"num_tokens": 9849146.0,
"reward": 1.3258066177368164,
"reward_std": 0.2873002290725708,
"rewards/accuracy_reward_step": 0.7109375,
"rewards/final_brier_reward_step": 0.7234610915184021,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.7941542863845825,
"step": 41
},
{
"adv/mean_abs_final_conf": 0.6904580593109131,
"adv/mean_abs_reasoning": 0.3798826038837433,
"adv/mean_abs_step_conf": 0.7600710391998291,
"adv/ratio_final_to_reasoning": 1.8175564036151974,
"adv/ratio_step_to_reasoning": 2.0008050682742926,
"adv/std_final_conf": 0.8443465232849121,
"adv/std_reasoning": 0.68148273229599,
"adv/std_step_conf": 0.9358599185943604,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5600777624482628,
"calib/avg_num_step_conf": 5.75390625,
"calib/ece": 0.44073517786561267,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9920948616600791,
"calib/gap": 0.013663113006396332,
"calib/mean_conf": 0.9703794466403161,
"calib/mu_c": 0.9768059701492535,
"calib/mu_w": 0.9631428571428572,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.44073517786561267,
"calib/std_conf": 0.06500713165870504,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5658237017310254,
"calib/step_q_c_n": 751.0,
"calib/step_q_gap": 0.026537552146537857,
"calib/step_q_w": 0.5392861495844875,
"calib/step_q_w_n": 722.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2876.0,
"completions/max_terminated_length": 2876.0,
"completions/mean_length": 450.90625,
"completions/mean_terminated_length": 452.6745300292969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.0448,
"grad_norm": 0.03713899478316307,
"kl": 0.0667572021484375,
"learning_rate": 4.388888888888889e-06,
"loss": 0.0299,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.034279078245162964,
"mask/share_reasoning": 0.8265150785446167,
"mask/share_step_conf": 0.13529960811138153,
"num_tokens": 10068946.0,
"reward": 1.2144601345062256,
"reward_std": 0.20866739749908447,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.5526596307754517,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.7869584560394287,
"step": 42
},
{
"adv/mean_abs_final_conf": 0.7522804737091064,
"adv/mean_abs_reasoning": 0.5812476873397827,
"adv/mean_abs_step_conf": 0.7755796909332275,
"adv/ratio_final_to_reasoning": 1.294251125801628,
"adv/ratio_step_to_reasoning": 1.3343359600841616,
"adv/std_final_conf": 0.8822755813598633,
"adv/std_reasoning": 0.7929232716560364,
"adv/std_step_conf": 0.9361656904220581,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.51890756302521,
"calib/avg_num_step_conf": 4.81640625,
"calib/ece": 0.4436106798418973,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9841897233201581,
"calib/gap": 0.010437271541452353,
"calib/mean_conf": 0.9732549486166008,
"calib/mu_c": 0.9781641791044775,
"calib/mu_w": 0.9677269075630252,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4436106798418973,
"calib/std_conf": 0.0633822962934615,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5934626291793313,
"calib/step_q_c_n": 658.0,
"calib/step_q_gap": 0.036869930048896604,
"calib/step_q_w": 0.5565926991304347,
"calib/step_q_w_n": 575.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2872.0,
"completions/max_terminated_length": 2872.0,
"completions/mean_length": 522.05078125,
"completions/mean_terminated_length": 522.05078125,
"completions/min_length": 142.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.04586666666666667,
"grad_norm": 0.028802556917071342,
"kl": 0.06124114990234375,
"learning_rate": 4.361111111111112e-06,
"loss": 0.0244,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.032978661358356476,
"mask/share_reasoning": 0.8547524213790894,
"mask/share_step_conf": 0.11226895451545715,
"num_tokens": 10307815.0,
"reward": 1.203322410583496,
"reward_std": 0.3031473457813263,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.5487642288208008,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.7777683734893799,
"step": 43
},
{
"adv/mean_abs_final_conf": 0.7487004399299622,
"adv/mean_abs_reasoning": 0.46431854367256165,
"adv/mean_abs_step_conf": 0.7619017362594604,
"adv/ratio_final_to_reasoning": 1.6124715459521841,
"adv/ratio_step_to_reasoning": 1.6409030968979672,
"adv/std_final_conf": 0.897011935710907,
"adv/std_reasoning": 0.7206284403800964,
"adv/std_step_conf": 0.9361453056335449,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5503915451709429,
"calib/avg_num_step_conf": 5.1640625,
"calib/ece": 0.5321837301587303,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9880952380952381,
"calib/gap": 0.0023173171197556908,
"calib/mean_conf": 0.9789297619047619,
"calib/mu_c": 0.98020796460177,
"calib/mu_w": 0.9778906474820143,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.531350396825397,
"calib/std_conf": 0.01960602502812919,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.571471961102107,
"calib/step_q_c_n": 617.0,
"calib/step_q_gap": 0.04919096819430546,
"calib/step_q_w": 0.5222809929078015,
"calib/step_q_w_n": 705.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2365.0,
"completions/max_terminated_length": 2365.0,
"completions/mean_length": 521.46484375,
"completions/mean_terminated_length": 523.5098266601562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 194.0,
"epoch": 0.046933333333333334,
"grad_norm": 0.03608904406428337,
"kl": 0.0600738525390625,
"learning_rate": 4.333333333333334e-06,
"loss": -0.0058,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.030586857348680496,
"mask/share_reasoning": 0.8524494171142578,
"mask/share_step_conf": 0.1130574494600296,
"num_tokens": 10547630.0,
"reward": 1.150404453277588,
"reward_std": 0.27872079610824585,
"rewards/accuracy_reward_step": 0.4453125,
"rewards/final_brier_reward_step": 0.46460020542144775,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7751356363296509,
"step": 44
},
{
"adv/mean_abs_final_conf": 0.7562808990478516,
"adv/mean_abs_reasoning": 0.6538466215133667,
"adv/mean_abs_step_conf": 0.7737356424331665,
"adv/ratio_final_to_reasoning": 1.1566640771155086,
"adv/ratio_step_to_reasoning": 1.1833595479048429,
"adv/std_final_conf": 0.9265098571777344,
"adv/std_reasoning": 0.8747087121009827,
"adv/std_step_conf": 0.9363182783126831,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6100305110602593,
"calib/avg_num_step_conf": 5.6015625,
"calib/ece": 0.4179729761904761,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9642857142857143,
"calib/gap": 0.03228314645308916,
"calib/mean_conf": 0.9655920238095238,
"calib/mu_c": 0.980196304347826,
"calib/mu_w": 0.9479131578947368,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.4179729761904761,
"calib/std_conf": 0.09740204758261063,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5701288936548913,
"calib/step_q_c_n": 736.0,
"calib/step_q_gap": 0.06116639604266594,
"calib/step_q_w": 0.5089624976122253,
"calib/step_q_w_n": 698.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2879.0,
"completions/max_terminated_length": 2879.0,
"completions/mean_length": 539.61328125,
"completions/mean_terminated_length": 539.61328125,
"completions/min_length": 155.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.048,
"grad_norm": 0.034370165318250656,
"kl": 0.1003265380859375,
"learning_rate": 4.305555555555556e-06,
"loss": -0.0024,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.032215941697359085,
"mask/share_reasoning": 0.8426299095153809,
"mask/share_step_conf": 0.12515416741371155,
"num_tokens": 10790819.0,
"reward": 1.2384848594665527,
"reward_std": 0.3294350504875183,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.5748702883720398,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7990965843200684,
"step": 45
},
{
"adv/mean_abs_final_conf": 0.69017094373703,
"adv/mean_abs_reasoning": 0.5205807685852051,
"adv/mean_abs_step_conf": 0.7614392638206482,
"adv/ratio_final_to_reasoning": 1.3257711106246284,
"adv/ratio_step_to_reasoning": 1.4626726720812797,
"adv/std_final_conf": 0.859300971031189,
"adv/std_reasoning": 0.775497555732727,
"adv/std_step_conf": 0.9362326860427856,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5912034009156311,
"calib/avg_num_step_conf": 5.80078125,
"calib/ece": 0.4221120481927712,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9919678714859438,
"calib/gap": 0.0053081098757360445,
"calib/mean_conf": 0.9803449799196788,
"calib/mu_c": 0.9826899280575541,
"calib/mu_w": 0.977381818181818,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4221120481927712,
"calib/std_conf": 0.015544703797893853,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5270673518742442,
"calib/step_q_c_n": 827.0,
"calib/step_q_gap": 0.011641060080931243,
"calib/step_q_w": 0.515426291793313,
"calib/step_q_w_n": 658.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2959.0,
"completions/max_terminated_length": 2959.0,
"completions/mean_length": 570.2265625,
"completions/mean_terminated_length": 572.4627685546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.04906666666666667,
"grad_norm": 0.023864952847361565,
"kl": 0.0557861328125,
"learning_rate": 4.277777777777778e-06,
"loss": 0.0192,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.032648518681526184,
"mask/share_reasoning": 0.8368463516235352,
"mask/share_step_conf": 0.12659892439842224,
"num_tokens": 11041565.0,
"reward": 1.2223206758499146,
"reward_std": 0.29786819219589233,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.5617954730987549,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.7898604273796082,
"step": 46
},
{
"adv/mean_abs_final_conf": 0.6998475193977356,
"adv/mean_abs_reasoning": 0.4415852427482605,
"adv/mean_abs_step_conf": 0.7632977962493896,
"adv/ratio_final_to_reasoning": 1.5848525984295756,
"adv/ratio_step_to_reasoning": 1.7285400922796044,
"adv/std_final_conf": 0.8608697056770325,
"adv/std_reasoning": 0.7013309001922607,
"adv/std_step_conf": 0.9359551072120667,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6020815156425278,
"calib/avg_num_step_conf": 5.69921875,
"calib/ece": 0.4211152941176471,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.996078431372549,
"calib/gap": 0.012229041505670946,
"calib/mean_conf": 0.9779780392156863,
"calib/mu_c": 0.9833971830985914,
"calib/mu_w": 0.9711681415929204,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4211152941176471,
"calib/std_conf": 0.06264071647787252,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5504315794191919,
"calib/step_q_c_n": 792.0,
"calib/step_q_gap": 0.039325589913944525,
"calib/step_q_w": 0.5111059895052473,
"calib/step_q_w_n": 667.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1867.0,
"completions/max_terminated_length": 1867.0,
"completions/mean_length": 528.3515625,
"completions/mean_terminated_length": 530.423583984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.050133333333333335,
"grad_norm": 0.02478746324777603,
"kl": 0.06172943115234375,
"learning_rate": 4.25e-06,
"loss": 0.0007,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.030939877033233643,
"mask/share_reasoning": 0.8441919684410095,
"mask/share_step_conf": 0.12096185982227325,
"num_tokens": 11282799.0,
"reward": 1.256608486175537,
"reward_std": 0.22781646251678467,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.5757490396499634,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8136557936668396,
"step": 47
},
{
"adv/mean_abs_final_conf": 0.7483422756195068,
"adv/mean_abs_reasoning": 0.5428138971328735,
"adv/mean_abs_step_conf": 0.7645820379257202,
"adv/ratio_final_to_reasoning": 1.378635071010945,
"adv/ratio_step_to_reasoning": 1.4085528059694479,
"adv/std_final_conf": 0.8885745406150818,
"adv/std_reasoning": 0.7753939628601074,
"adv/std_step_conf": 0.9362524151802063,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.546958006577283,
"calib/avg_num_step_conf": 5.3359375,
"calib/ece": 0.5160222222222224,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9920634920634921,
"calib/gap": -0.0037177333670629364,
"calib/mean_conf": 0.976815873015873,
"calib/mu_c": 0.9748389830508475,
"calib/mu_w": 0.9785567164179104,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.5122920634920637,
"calib/std_conf": 0.06378458341465201,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5602868462757529,
"calib/step_q_c_n": 631.0,
"calib/step_q_gap": 0.02064861498323589,
"calib/step_q_w": 0.539638231292517,
"calib/step_q_w_n": 735.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2522.0,
"completions/max_terminated_length": 2522.0,
"completions/mean_length": 504.9140625,
"completions/mean_terminated_length": 504.9140625,
"completions/min_length": 140.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.0512,
"grad_norm": 0.026672683656215668,
"kl": 0.064849853515625,
"learning_rate": 4.222222222222223e-06,
"loss": 0.0202,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.0343143455684185,
"mask/share_reasoning": 0.838284969329834,
"mask/share_step_conf": 0.1274007260799408,
"num_tokens": 11515745.0,
"reward": 1.1532585620880127,
"reward_std": 0.2807466983795166,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.478743314743042,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7697461843490601,
"step": 48
},
{
"adv/mean_abs_final_conf": 0.692025899887085,
"adv/mean_abs_reasoning": 0.4786022901535034,
"adv/mean_abs_step_conf": 0.7627895474433899,
"adv/ratio_final_to_reasoning": 1.44593102482884,
"adv/ratio_step_to_reasoning": 1.5937858283100532,
"adv/std_final_conf": 0.8504143953323364,
"adv/std_reasoning": 0.7393408417701721,
"adv/std_step_conf": 0.9362682104110718,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5810846386731605,
"calib/avg_num_step_conf": 5.66015625,
"calib/ece": 0.4124707228915664,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.9959839357429718,
"calib/gap": 0.002727752402263861,
"calib/mean_conf": 0.9827518473895583,
"calib/mu_c": 0.9839240140845068,
"calib/mu_w": 0.9811962616822429,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.4124707228915664,
"calib/std_conf": 0.012489955924533542,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5635376092544987,
"calib/step_q_c_n": 778.0,
"calib/step_q_gap": 0.05391768873780722,
"calib/step_q_w": 0.5096199205166915,
"calib/step_q_w_n": 671.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2386.0,
"completions/max_terminated_length": 2386.0,
"completions/mean_length": 520.98046875,
"completions/mean_terminated_length": 520.98046875,
"completions/min_length": 212.0,
"completions/min_terminated_length": 212.0,
"epoch": 0.05226666666666667,
"grad_norm": 0.023565007373690605,
"kl": 0.06116485595703125,
"learning_rate": 4.194444444444445e-06,
"loss": 0.0253,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03131241351366043,
"mask/share_reasoning": 0.8423235416412354,
"mask/share_step_conf": 0.12636405229568481,
"num_tokens": 11753652.0,
"reward": 1.2214174270629883,
"reward_std": 0.2773900628089905,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.5696588754653931,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.784244179725647,
"step": 49
},
{
"adv/mean_abs_final_conf": 0.7077445983886719,
"adv/mean_abs_reasoning": 0.46105533838272095,
"adv/mean_abs_step_conf": 0.7690777778625488,
"adv/ratio_final_to_reasoning": 1.53505347291126,
"adv/ratio_step_to_reasoning": 1.6680812775323277,
"adv/std_final_conf": 0.882941484451294,
"adv/std_reasoning": 0.7205641269683838,
"adv/std_step_conf": 0.9361612200737,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5750131027253668,
"calib/avg_num_step_conf": 5.30859375,
"calib/ece": 0.35825490196078436,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.9921568627450981,
"calib/gap": 0.008864976415094206,
"calib/mean_conf": 0.9800588235294118,
"calib/mu_c": 0.9833962264150943,
"calib/mu_w": 0.9745312500000001,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.35739215686274517,
"calib/std_conf": 0.0451282320073157,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5687125153374233,
"calib/step_q_c_n": 815.0,
"calib/step_q_gap": 0.042891514232450856,
"calib/step_q_w": 0.5258210011049724,
"calib/step_q_w_n": 543.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1453.0,
"completions/max_terminated_length": 1453.0,
"completions/mean_length": 502.16796875,
"completions/mean_terminated_length": 504.1372985839844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 173.0,
"epoch": 0.05333333333333334,
"grad_norm": 0.027782263234257698,
"kl": 0.0827178955078125,
"learning_rate": 4.166666666666667e-06,
"loss": -0.0097,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03265595808625221,
"mask/share_reasoning": 0.8410335183143616,
"mask/share_step_conf": 0.12240426242351532,
"num_tokens": 11987567.0,
"reward": 1.2991454601287842,
"reward_std": 0.26816582679748535,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.6338658928871155,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8208842873573303,
"step": 50
},
{
"adv/mean_abs_final_conf": 0.6769541501998901,
"adv/mean_abs_reasoning": 0.4819958209991455,
"adv/mean_abs_step_conf": 0.7450014352798462,
"adv/ratio_final_to_reasoning": 1.404481368316864,
"adv/ratio_step_to_reasoning": 1.5456595323492792,
"adv/std_final_conf": 0.8652604222297668,
"adv/std_reasoning": 0.7575814723968506,
"adv/std_step_conf": 0.9360484480857849,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6032391318602058,
"calib/avg_num_step_conf": 5.18359375,
"calib/ece": 0.3813896825396826,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.996031746031746,
"calib/gap": 0.00809154809520718,
"calib/mean_conf": 0.9805960317460318,
"calib/mu_c": 0.9838390728476822,
"calib/mu_w": 0.9757475247524751,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3813896825396826,
"calib/std_conf": 0.016892831675913256,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5683220778061225,
"calib/step_q_c_n": 784.0,
"calib/step_q_gap": 0.02109250322048717,
"calib/step_q_w": 0.5472295745856354,
"calib/step_q_w_n": 543.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2846.0,
"completions/max_terminated_length": 2846.0,
"completions/mean_length": 536.4375,
"completions/mean_terminated_length": 536.4375,
"completions/min_length": 166.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.0544,
"grad_norm": 0.02513463795185089,
"kl": 0.06470489501953125,
"learning_rate": 4.138888888888889e-06,
"loss": 0.025,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.032385122030973434,
"mask/share_reasoning": 0.8494609594345093,
"mask/share_step_conf": 0.11815392971038818,
"num_tokens": 12234191.0,
"reward": 1.2722687721252441,
"reward_std": 0.27277636528015137,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6083289384841919,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8106822967529297,
"step": 51
},
{
"adv/mean_abs_final_conf": 0.6640352606773376,
"adv/mean_abs_reasoning": 0.4049059748649597,
"adv/mean_abs_step_conf": 0.7678852081298828,
"adv/ratio_final_to_reasoning": 1.639973973954818,
"adv/ratio_step_to_reasoning": 1.8964531417101969,
"adv/std_final_conf": 0.8312138915061951,
"adv/std_reasoning": 0.6816580891609192,
"adv/std_step_conf": 0.935936689376831,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6188590116279069,
"calib/avg_num_step_conf": 5.21875,
"calib/ece": 0.295332142857143,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9880952380952381,
"calib/gap": 0.015278343023255814,
"calib/mean_conf": 0.9778718253968255,
"calib/mu_c": 0.9827220930232557,
"calib/mu_w": 0.9674437499999999,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.295332142857143,
"calib/std_conf": 0.039204568791510054,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5293833658008659,
"calib/step_q_c_n": 924.0,
"calib/step_q_gap": 0.015291933762030863,
"calib/step_q_w": 0.514091432038835,
"calib/step_q_w_n": 412.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2797.0,
"completions/max_terminated_length": 2797.0,
"completions/mean_length": 505.328125,
"completions/mean_terminated_length": 505.328125,
"completions/min_length": 175.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.055466666666666664,
"grad_norm": 0.03392941132187843,
"kl": 0.07430267333984375,
"learning_rate": 4.111111111111111e-06,
"loss": 0.0163,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.033117808401584625,
"mask/share_reasoning": 0.8461878299713135,
"mask/share_step_conf": 0.12069441378116608,
"num_tokens": 12471507.0,
"reward": 1.3340625762939453,
"reward_std": 0.2229156792163849,
"rewards/accuracy_reward_step": 0.671875,
"rewards/final_brier_reward_step": 0.6863217949867249,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8256672024726868,
"step": 52
},
{
"adv/mean_abs_final_conf": 0.6531897783279419,
"adv/mean_abs_reasoning": 0.4806157946586609,
"adv/mean_abs_step_conf": 0.746953547000885,
"adv/ratio_final_to_reasoning": 1.359068481700326,
"adv/ratio_step_to_reasoning": 1.5541593832374576,
"adv/std_final_conf": 0.8576918244361877,
"adv/std_reasoning": 0.7575007081031799,
"adv/std_step_conf": 0.9360480904579163,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5677368086458995,
"calib/avg_num_step_conf": 5.58203125,
"calib/ece": 0.41577096234644195,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9920948616600791,
"calib/gap": 0.005495318071715105,
"calib/mean_conf": 0.9809883536507897,
"calib/mu_c": 0.9833776223776223,
"calib/mu_w": 0.9778823043059072,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.41577096234644195,
"calib/std_conf": 0.01716129647237015,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5705359947643979,
"calib/step_q_c_n": 764.0,
"calib/step_q_gap": 0.061252509893888196,
"calib/step_q_w": 0.5092834848705097,
"calib/step_q_w_n": 665.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2682.0,
"completions/max_terminated_length": 2682.0,
"completions/mean_length": 497.57421875,
"completions/mean_terminated_length": 497.57421875,
"completions/min_length": 152.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.05653333333333333,
"grad_norm": 0.02414063923060894,
"kl": 0.0670623779296875,
"learning_rate": 4.083333333333334e-06,
"loss": -0.0015,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03233794867992401,
"mask/share_reasoning": 0.843818187713623,
"mask/share_step_conf": 0.12384383380413055,
"num_tokens": 12704710.0,
"reward": 1.2482346296310425,
"reward_std": 0.2573202848434448,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.5768751502037048,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8055001497268677,
"step": 53
},
{
"adv/mean_abs_final_conf": 0.593941330909729,
"adv/mean_abs_reasoning": 0.34782564640045166,
"adv/mean_abs_step_conf": 0.7776609063148499,
"adv/ratio_final_to_reasoning": 1.707583489188501,
"adv/ratio_step_to_reasoning": 2.2357779374886255,
"adv/std_final_conf": 0.7841689586639404,
"adv/std_reasoning": 0.640178918838501,
"adv/std_step_conf": 0.9358230829238892,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6308143910883637,
"calib/avg_num_step_conf": 5.14453125,
"calib/ece": 0.2601203921568628,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.9764705882352941,
"calib/gap": 0.049350526870389766,
"calib/mean_conf": 0.9674145098039215,
"calib/mu_c": 0.9815423076923078,
"calib/mu_w": 0.932191780821918,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.256904705882353,
"calib/std_conf": 0.11191986062013434,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5804673228346456,
"calib/step_q_c_n": 889.0,
"calib/step_q_gap": 0.06184279012436533,
"calib/step_q_w": 0.5186245327102803,
"calib/step_q_w_n": 428.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1258.0,
"completions/max_terminated_length": 1258.0,
"completions/mean_length": 450.76171875,
"completions/mean_terminated_length": 452.5294494628906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.0576,
"grad_norm": 0.027992311865091324,
"kl": 0.06845855712890625,
"learning_rate": 4.055555555555556e-06,
"loss": -0.0377,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.036233823746442795,
"mask/share_reasoning": 0.8332265615463257,
"mask/share_step_conf": 0.12663336098194122,
"num_tokens": 12926337.0,
"reward": 1.3820880651474,
"reward_std": 0.20170414447784424,
"rewards/accuracy_reward_step": 0.7109375,
"rewards/final_brier_reward_step": 0.7360745668411255,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8433475494384766,
"step": 54
},
{
"adv/mean_abs_final_conf": 0.7017855048179626,
"adv/mean_abs_reasoning": 0.5232642889022827,
"adv/mean_abs_step_conf": 0.7613639235496521,
"adv/ratio_final_to_reasoning": 1.3411683535488086,
"adv/ratio_step_to_reasoning": 1.4550274874420743,
"adv/std_final_conf": 0.8780732154846191,
"adv/std_reasoning": 0.7754185795783997,
"adv/std_step_conf": 0.9361396431922913,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6526717557251909,
"calib/avg_num_step_conf": 4.9140625,
"calib/ece": 0.4446177165354331,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.952755905511811,
"calib/gap": 0.04878835102091483,
"calib/mean_conf": 0.9603657480314961,
"calib/mu_c": 0.983991603053435,
"calib/mu_w": 0.9352032520325202,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.4446177165354331,
"calib/std_conf": 0.12634521874310717,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5507576452599389,
"calib/step_q_c_n": 654.0,
"calib/step_q_gap": 0.030742413471859353,
"calib/step_q_w": 0.5200152317880795,
"calib/step_q_w_n": 604.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1599.0,
"completions/max_terminated_length": 1599.0,
"completions/mean_length": 456.1875,
"completions/mean_terminated_length": 457.97650146484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.058666666666666666,
"grad_norm": 0.03539075702428818,
"kl": 0.076690673828125,
"learning_rate": 4.027777777777779e-06,
"loss": -0.0211,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03572503477334976,
"mask/share_reasoning": 0.8378803730010986,
"mask/share_step_conf": 0.12248837947845459,
"num_tokens": 13150945.0,
"reward": 1.2235780954360962,
"reward_std": 0.28690779209136963,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.5558318495750427,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7964434027671814,
"step": 55
},
{
"adv/mean_abs_final_conf": 0.7039684057235718,
"adv/mean_abs_reasoning": 0.49995529651641846,
"adv/mean_abs_step_conf": 0.7258338928222656,
"adv/ratio_final_to_reasoning": 1.4080627020629104,
"adv/ratio_step_to_reasoning": 1.4517975864636716,
"adv/std_final_conf": 0.8813683986663818,
"adv/std_reasoning": 0.7575625777244568,
"adv/std_step_conf": 0.9361122846603394,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5774436090225565,
"calib/avg_num_step_conf": 5.57421875,
"calib/ece": 0.5015957599712543,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9762845849802372,
"calib/gap": 0.010814957849168527,
"calib/mean_conf": 0.975904060366511,
"calib/mu_c": 0.981589393939394,
"calib/mu_w": 0.9707744360902255,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.5015957599712543,
"calib/std_conf": 0.06488835931336258,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5589189078338228,
"calib/step_q_c_n": 667.0,
"calib/step_q_gap": 0.04176423764912618,
"calib/step_q_w": 0.5171546701846966,
"calib/step_q_w_n": 758.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2134.0,
"completions/max_terminated_length": 2134.0,
"completions/mean_length": 486.2578125,
"completions/mean_terminated_length": 488.16473388671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.05973333333333333,
"grad_norm": 0.03635377436876297,
"kl": 0.07538604736328125,
"learning_rate": 4.000000000000001e-06,
"loss": -0.034,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03347313404083252,
"mask/share_reasoning": 0.8359185457229614,
"mask/share_step_conf": 0.12670202553272247,
"num_tokens": 13382267.0,
"reward": 1.2034459114074707,
"reward_std": 0.26581820845603943,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.49430450797080994,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8109811544418335,
"step": 56
},
{
"adv/mean_abs_final_conf": 0.724047064781189,
"adv/mean_abs_reasoning": 0.6356273293495178,
"adv/mean_abs_step_conf": 0.7706122398376465,
"adv/ratio_final_to_reasoning": 1.139106251963957,
"adv/ratio_step_to_reasoning": 1.2123648626409256,
"adv/std_final_conf": 0.89552241563797,
"adv/std_reasoning": 0.8428938388824463,
"adv/std_step_conf": 0.936430037021637,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6060245611779079,
"calib/avg_num_step_conf": 4.97265625,
"calib/ece": 0.38820039370078746,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.984251968503937,
"calib/gap": 0.00575520478364322,
"calib/mean_conf": 0.9794602362204724,
"calib/mu_c": 0.9817940397350994,
"calib/mu_w": 0.9760388349514562,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.386586220472441,
"calib/std_conf": 0.031379416016721846,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5748436361185983,
"calib/step_q_c_n": 742.0,
"calib/step_q_gap": 0.03467443459768571,
"calib/step_q_w": 0.5401692015209126,
"calib/step_q_w_n": 526.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2189.0,
"completions/max_terminated_length": 2189.0,
"completions/mean_length": 479.8671875,
"completions/mean_terminated_length": 479.8671875,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.0608,
"grad_norm": 0.030560489743947983,
"kl": 0.06566619873046875,
"learning_rate": 3.972222222222223e-06,
"loss": -0.014,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03349709138274193,
"mask/share_reasoning": 0.8486497402191162,
"mask/share_step_conf": 0.11785320192575455,
"num_tokens": 13611905.0,
"reward": 1.2591137886047363,
"reward_std": 0.3345072567462921,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.5999172925949097,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8017332553863525,
"step": 57
},
{
"adv/mean_abs_final_conf": 0.7447444200515747,
"adv/mean_abs_reasoning": 0.6445914506912231,
"adv/mean_abs_step_conf": 0.7659875154495239,
"adv/ratio_final_to_reasoning": 1.155374337113769,
"adv/ratio_step_to_reasoning": 1.1883302433318383,
"adv/std_final_conf": 0.8873608112335205,
"adv/std_reasoning": 0.8429160118103027,
"adv/std_step_conf": 0.9360173940658569,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.4945436507936508,
"calib/avg_num_step_conf": 5.984375,
"calib/ece": 0.46773477690288734,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.968503937007874,
"calib/gap": 0.0022207919973545875,
"calib/mean_conf": 0.971671784776903,
"calib/mu_c": 0.9727734375000001,
"calib/mu_w": 0.9705526455026455,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.46773477690288734,
"calib/std_conf": 0.04788829024024461,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5344919556171983,
"calib/step_q_c_n": 721.0,
"calib/step_q_gap": 0.057724954877371004,
"calib/step_q_w": 0.4767670007398273,
"calib/step_q_w_n": 811.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2174.0,
"completions/max_terminated_length": 2174.0,
"completions/mean_length": 536.26953125,
"completions/mean_terminated_length": 538.37255859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 176.0,
"epoch": 0.06186666666666667,
"grad_norm": 0.03641160577535629,
"kl": 0.090789794921875,
"learning_rate": 3.944444444444445e-06,
"loss": -0.0299,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.030999979004263878,
"mask/share_reasoning": 0.8410400152206421,
"mask/share_step_conf": 0.12405376881361008,
"num_tokens": 13855510.0,
"reward": 1.2054097652435303,
"reward_std": 0.2900436520576477,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.5219812989234924,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7959815263748169,
"step": 58
},
{
"adv/mean_abs_final_conf": 0.7536922097206116,
"adv/mean_abs_reasoning": 0.5401763916015625,
"adv/mean_abs_step_conf": 0.7711378335952759,
"adv/ratio_final_to_reasoning": 1.3952705476187113,
"adv/ratio_step_to_reasoning": 1.4275667089206519,
"adv/std_final_conf": 0.8953720927238464,
"adv/std_reasoning": 0.7576656937599182,
"adv/std_step_conf": 0.9361147284507751,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5574965439235893,
"calib/avg_num_step_conf": 5.01171875,
"calib/ece": 0.4041764705882354,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.9764705882352941,
"calib/gap": 0.0017679401784591597,
"calib/mean_conf": 0.9743333333333333,
"calib/mu_c": 0.9750890410958905,
"calib/mu_w": 0.9733211009174313,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4029803921568628,
"calib/std_conf": 0.029456559352526033,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5775435356200528,
"calib/step_q_c_n": 758.0,
"calib/step_q_gap": 0.01650467847719561,
"calib/step_q_w": 0.5610388571428572,
"calib/step_q_w_n": 525.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1284.0,
"completions/max_terminated_length": 1284.0,
"completions/mean_length": 475.5859375,
"completions/mean_terminated_length": 477.4510192871094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.06293333333333333,
"grad_norm": 0.03324250504374504,
"kl": 0.063262939453125,
"learning_rate": 3.916666666666667e-06,
"loss": -0.0566,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.034234900027513504,
"mask/share_reasoning": 0.8443710207939148,
"mask/share_step_conf": 0.1174878254532814,
"num_tokens": 14083508.0,
"reward": 1.2565596103668213,
"reward_std": 0.2741585969924927,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.5915107727050781,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8041635751724243,
"step": 59
},
{
"adv/mean_abs_final_conf": 0.7119778394699097,
"adv/mean_abs_reasoning": 0.5205633640289307,
"adv/mean_abs_step_conf": 0.7670281529426575,
"adv/ratio_final_to_reasoning": 1.3677063901683657,
"adv/ratio_step_to_reasoning": 1.4734578073381848,
"adv/std_final_conf": 0.8901903033256531,
"adv/std_reasoning": 0.7752787470817566,
"adv/std_step_conf": 0.9362204074859619,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6234294161123428,
"calib/avg_num_step_conf": 4.75,
"calib/ece": 0.4560941176470589,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.9686274509803922,
"calib/gap": 0.006737620103473785,
"calib/mean_conf": 0.9717803921568628,
"calib/mu_c": 0.9750303030303031,
"calib/mu_w": 0.9682926829268294,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4551137254901962,
"calib/std_conf": 0.02803832824593159,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5701520826666667,
"calib/step_q_c_n": 612.0,
"calib/step_q_gap": 0.04724744690507732,
"calib/step_q_w": 0.5229046357615894,
"calib/step_q_w_n": 604.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1112.0,
"completions/max_terminated_length": 1112.0,
"completions/mean_length": 462.03515625,
"completions/mean_terminated_length": 463.8470764160156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 203.0,
"epoch": 0.064,
"grad_norm": 0.038515787571668625,
"kl": 0.0582733154296875,
"learning_rate": 3.88888888888889e-06,
"loss": -0.0229,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.0341600701212883,
"mask/share_reasoning": 0.8478833436965942,
"mask/share_step_conf": 0.11405040323734283,
"num_tokens": 14310645.0,
"reward": 1.2330572605133057,
"reward_std": 0.2662425935268402,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.544211208820343,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8097797632217407,
"step": 60
},
{
"adv/mean_abs_final_conf": 0.7021837830543518,
"adv/mean_abs_reasoning": 0.49892711639404297,
"adv/mean_abs_step_conf": 0.7648937702178955,
"adv/ratio_final_to_reasoning": 1.4073874920435887,
"adv/ratio_step_to_reasoning": 1.533077167154405,
"adv/std_final_conf": 0.8997812867164612,
"adv/std_reasoning": 0.757564902305603,
"adv/std_step_conf": 0.9359845519065857,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5786557855626326,
"calib/avg_num_step_conf": 4.72265625,
"calib/ece": 0.3507233201581028,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9841897233201581,
"calib/gap": 0.011121815286624037,
"calib/mean_conf": 0.9712766798418973,
"calib/mu_c": 0.975496815286624,
"calib/mu_w": 0.964375,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3507233201581028,
"calib/std_conf": 0.029815283266292767,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5859243101182655,
"calib/step_q_c_n": 761.0,
"calib/step_q_gap": 0.030174310118265546,
"calib/step_q_w": 0.55575,
"calib/step_q_w_n": 448.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2504.0,
"completions/max_terminated_length": 2504.0,
"completions/mean_length": 426.35546875,
"completions/mean_terminated_length": 426.35546875,
"completions/min_length": 114.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.06506666666666666,
"grad_norm": 0.026272548362612724,
"kl": 0.078460693359375,
"learning_rate": 3.861111111111112e-06,
"loss": -0.0135,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03931958228349686,
"mask/share_reasoning": 0.834287166595459,
"mask/share_step_conf": 0.12639322876930237,
"num_tokens": 14523856.0,
"reward": 1.2799744606018066,
"reward_std": 0.28759801387786865,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.6383060812950134,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8002746105194092,
"step": 61
},
{
"adv/mean_abs_final_conf": 0.7885167002677917,
"adv/mean_abs_reasoning": 0.6170656085014343,
"adv/mean_abs_step_conf": 0.778965950012207,
"adv/ratio_final_to_reasoning": 1.2778490478228604,
"adv/ratio_step_to_reasoning": 1.2623713577296156,
"adv/std_final_conf": 0.9230625629425049,
"adv/std_reasoning": 0.8100219964981079,
"adv/std_step_conf": 0.9363936185836792,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.552384,
"calib/avg_num_step_conf": 5.3125,
"calib/ece": 0.45596000000000014,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.928,
"calib/gap": 0.0196160000000003,
"calib/mean_conf": 0.95396,
"calib/mu_c": 0.9637680000000002,
"calib/mu_w": 0.9441519999999999,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.45496000000000014,
"calib/std_conf": 0.0837418318404846,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5657848739495798,
"calib/step_q_c_n": 595.0,
"calib/step_q_gap": 0.0682291092436974,
"calib/step_q_w": 0.49755576470588236,
"calib/step_q_w_n": 765.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2389.0,
"completions/max_terminated_length": 2389.0,
"completions/mean_length": 490.96875,
"completions/mean_terminated_length": 494.83465576171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.06613333333333334,
"grad_norm": 0.036133233457803726,
"kl": 0.0560760498046875,
"learning_rate": 3.833333333333334e-06,
"loss": 0.0055,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.033206403255462646,
"mask/share_reasoning": 0.8455889225006104,
"mask/share_step_conf": 0.11339214444160461,
"num_tokens": 14756624.0,
"reward": 1.1832115650177002,
"reward_std": 0.3248516917228699,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.5339019894599915,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7697761058807373,
"step": 62
},
{
"adv/mean_abs_final_conf": 0.7545422911643982,
"adv/mean_abs_reasoning": 0.5572085380554199,
"adv/mean_abs_step_conf": 0.7710795998573303,
"adv/ratio_final_to_reasoning": 1.3541470376560372,
"adv/ratio_step_to_reasoning": 1.383825887787525,
"adv/std_final_conf": 0.9127007722854614,
"adv/std_reasoning": 0.7927935123443604,
"adv/std_step_conf": 0.936107873916626,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6145327858653726,
"calib/avg_num_step_conf": 4.68359375,
"calib/ece": 0.40417254901960775,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.9137254901960784,
"calib/gap": 0.01083184023889483,
"calib/mean_conf": 0.9480156862745098,
"calib/mu_c": 0.9528581560283687,
"calib/mu_w": 0.9420263157894738,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.39962352941176466,
"calib/std_conf": 0.05407873480271483,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5841763826606875,
"calib/step_q_c_n": 669.0,
"calib/step_q_gap": 0.05211977888710262,
"calib/step_q_w": 0.5320566037735849,
"calib/step_q_w_n": 530.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1324.0,
"completions/max_terminated_length": 1324.0,
"completions/mean_length": 488.4609375,
"completions/mean_terminated_length": 490.3764953613281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 101.0,
"epoch": 0.0672,
"grad_norm": 0.025932716205716133,
"kl": 0.0595855712890625,
"learning_rate": 3.8055555555555556e-06,
"loss": -0.0468,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.0346166267991066,
"mask/share_reasoning": 0.8508518934249878,
"mask/share_step_conf": 0.1106252670288086,
"num_tokens": 14990310.0,
"reward": 1.2687819004058838,
"reward_std": 0.27470365166664124,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.5966545343399048,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8161576986312866,
"step": 63
},
{
"adv/mean_abs_final_conf": 0.7520545125007629,
"adv/mean_abs_reasoning": 0.521262526512146,
"adv/mean_abs_step_conf": 0.7765277624130249,
"adv/ratio_final_to_reasoning": 1.4427557598143192,
"adv/ratio_step_to_reasoning": 1.4897057104965916,
"adv/std_final_conf": 0.9201980233192444,
"adv/std_reasoning": 0.7753161787986755,
"adv/std_step_conf": 0.9359229803085327,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5721966205837173,
"calib/avg_num_step_conf": 4.26953125,
"calib/ece": 0.3141129921259842,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.905511811023622,
"calib/gap": 0.01033412809724188,
"calib/mean_conf": 0.9356901574803149,
"calib/mu_c": 0.9394739130434783,
"calib/mu_w": 0.9291397849462364,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3079724409448818,
"calib/std_conf": 0.10479543450027383,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5752903790087464,
"calib/step_q_c_n": 686.0,
"calib/step_q_gap": 0.017683499401866754,
"calib/step_q_w": 0.5576068796068796,
"calib/step_q_w_n": 407.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2185.0,
"completions/max_terminated_length": 2185.0,
"completions/mean_length": 434.5546875,
"completions/mean_terminated_length": 436.25885009765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.06826666666666667,
"grad_norm": 0.035325441509485245,
"kl": 0.06046295166015625,
"learning_rate": 3.777777777777778e-06,
"loss": 0.0079,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.0373111218214035,
"mask/share_reasoning": 0.8508127927780151,
"mask/share_step_conf": 0.10796987265348434,
"num_tokens": 15205332.0,
"reward": 1.3001196384429932,
"reward_std": 0.26134374737739563,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.6653909087181091,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8053147792816162,
"step": 64
},
{
"adv/mean_abs_final_conf": 0.7708741426467896,
"adv/mean_abs_reasoning": 0.4588015675544739,
"adv/mean_abs_step_conf": 0.7731654047966003,
"adv/ratio_final_to_reasoning": 1.6801907342116111,
"adv/ratio_step_to_reasoning": 1.6851847497334493,
"adv/std_final_conf": 0.9213289618492126,
"adv/std_reasoning": 0.720514178276062,
"adv/std_step_conf": 0.936091959476471,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5939365671641791,
"calib/avg_num_step_conf": 4.12109375,
"calib/ece": 0.4219133858267716,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.9251968503937008,
"calib/gap": 0.02082661691542287,
"calib/mean_conf": 0.947503937007874,
"calib/mu_c": 0.9573432835820895,
"calib/mu_w": 0.9365166666666667,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4209291338582677,
"calib/std_conf": 0.07623801044405916,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5993211678832117,
"calib/step_q_c_n": 548.0,
"calib/step_q_gap": 0.03390381088123928,
"calib/step_q_w": 0.5654173570019724,
"calib/step_q_w_n": 507.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 990.0,
"completions/max_terminated_length": 990.0,
"completions/mean_length": 407.1328125,
"completions/mean_terminated_length": 408.72943115234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 122.0,
"epoch": 0.06933333333333333,
"grad_norm": 0.03840414434671402,
"kl": 0.06768035888671875,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.0382,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.039253175258636475,
"mask/share_reasoning": 0.8444764614105225,
"mask/share_step_conf": 0.11236413568258286,
"num_tokens": 15414582.0,
"reward": 1.2306933403015137,
"reward_std": 0.23277725279331207,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.5744519829750061,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.7915141582489014,
"step": 65
},
{
"adv/mean_abs_final_conf": 0.7484292984008789,
"adv/mean_abs_reasoning": 0.489560604095459,
"adv/mean_abs_step_conf": 0.7633772492408752,
"adv/ratio_final_to_reasoning": 1.5287776265897068,
"adv/ratio_step_to_reasoning": 1.559311028818048,
"adv/std_final_conf": 0.9219995141029358,
"adv/std_reasoning": 0.7574658989906311,
"adv/std_step_conf": 0.935945451259613,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6697958669354838,
"calib/avg_num_step_conf": 4.7109375,
"calib/ece": 0.4382460317460317,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.7976190476190477,
"calib/gap": 0.03244002016129033,
"calib/mean_conf": 0.9261031746031746,
"calib/mu_c": 0.9425806451612903,
"calib/mu_w": 0.9101406249999999,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.43614285714285717,
"calib/std_conf": 0.07387616508522672,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5608768670309654,
"calib/step_q_c_n": 549.0,
"calib/step_q_gap": 0.04943090051650567,
"calib/step_q_w": 0.5114459665144597,
"calib/step_q_w_n": 657.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2763.0,
"completions/max_terminated_length": 2763.0,
"completions/mean_length": 499.89453125,
"completions/mean_terminated_length": 501.85491943359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 104.0,
"epoch": 0.0704,
"grad_norm": 0.0336519218981266,
"kl": 0.060970306396484375,
"learning_rate": 3.7222222222222225e-06,
"loss": 0.0229,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.035479508340358734,
"mask/share_reasoning": 0.8528700470924377,
"mask/share_step_conf": 0.10774420201778412,
"num_tokens": 15648907.0,
"reward": 1.227863073348999,
"reward_std": 0.23462948203086853,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.5634865760803223,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7992448806762695,
"step": 66
},
{
"adv/mean_abs_final_conf": 0.7395097613334656,
"adv/mean_abs_reasoning": 0.4111238718032837,
"adv/mean_abs_step_conf": 0.7691671848297119,
"adv/ratio_final_to_reasoning": 1.798751695175972,
"adv/ratio_step_to_reasoning": 1.870889134839016,
"adv/std_final_conf": 0.9214187264442444,
"adv/std_reasoning": 0.7012759447097778,
"adv/std_step_conf": 0.936107337474823,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6463777285669092,
"calib/avg_num_step_conf": 4.25390625,
"calib/ece": 0.3530314960629921,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.8188976377952756,
"calib/gap": 0.03838721923441957,
"calib/mean_conf": 0.9203543307086614,
"calib/mu_c": 0.9368275862068965,
"calib/mu_w": 0.898440366972477,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.351259842519685,
"calib/std_conf": 0.1045672768825448,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6015963963963964,
"calib/step_q_c_n": 555.0,
"calib/step_q_gap": 0.09310605868532695,
"calib/step_q_w": 0.5084903377110694,
"calib/step_q_w_n": 533.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1676.0,
"completions/max_terminated_length": 1676.0,
"completions/mean_length": 457.13671875,
"completions/mean_terminated_length": 458.929443359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.07146666666666666,
"grad_norm": 0.030173419043421745,
"kl": 0.05756378173828125,
"learning_rate": 3.694444444444445e-06,
"loss": -0.0259,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03676186501979828,
"mask/share_reasoning": 0.8619298934936523,
"mask/share_step_conf": 0.0974019393324852,
"num_tokens": 15870942.0,
"reward": 1.2942092418670654,
"reward_std": 0.24686667323112488,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6318415999412537,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.822819709777832,
"step": 67
},
{
"adv/mean_abs_final_conf": 0.7641202211380005,
"adv/mean_abs_reasoning": 0.40942007303237915,
"adv/mean_abs_step_conf": 0.7585666179656982,
"adv/ratio_final_to_reasoning": 1.8663477232040104,
"adv/ratio_step_to_reasoning": 1.8527831631394065,
"adv/std_final_conf": 0.9237704873085022,
"adv/std_reasoning": 0.6815453171730042,
"adv/std_step_conf": 0.9358956217765808,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6691687344913152,
"calib/avg_num_step_conf": 4.2421875,
"calib/ece": 0.40511811023622046,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.7598425196850394,
"calib/gap": 0.053322580645161155,
"calib/mean_conf": 0.9119685039370078,
"calib/mu_c": 0.9380000000000001,
"calib/mu_w": 0.8846774193548389,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4026377952755905,
"calib/std_conf": 0.11687896269809996,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5686440162271805,
"calib/step_q_c_n": 493.0,
"calib/step_q_gap": 0.05511568570441483,
"calib/step_q_w": 0.5135283305227657,
"calib/step_q_w_n": 593.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2353.0,
"completions/max_terminated_length": 2353.0,
"completions/mean_length": 437.6640625,
"completions/mean_terminated_length": 437.6640625,
"completions/min_length": 107.0,
"completions/min_terminated_length": 107.0,
"epoch": 0.07253333333333334,
"grad_norm": 0.03539825975894928,
"kl": 0.06148529052734375,
"learning_rate": 3.6666666666666666e-06,
"loss": -0.0165,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.040313176810741425,
"mask/share_reasoning": 0.8507459759712219,
"mask/share_step_conf": 0.10894083976745605,
"num_tokens": 16087072.0,
"reward": 1.240929365158081,
"reward_std": 0.21346575021743774,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.5982882976531982,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.7917851805686951,
"step": 68
},
{
"adv/mean_abs_final_conf": 0.7670649886131287,
"adv/mean_abs_reasoning": 0.5342639088630676,
"adv/mean_abs_step_conf": 0.7311890721321106,
"adv/ratio_final_to_reasoning": 1.435741729673393,
"adv/ratio_step_to_reasoning": 1.3685915518570337,
"adv/std_final_conf": 0.9319685101509094,
"adv/std_reasoning": 0.7928503155708313,
"adv/std_step_conf": 0.9363734126091003,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6099412182675538,
"calib/avg_num_step_conf": 4.140625,
"calib/ece": 0.44009600000000004,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.62,
"calib/gap": 0.04333621859053027,
"calib/mean_conf": 0.891296,
"calib/mu_c": 0.9150442477876106,
"calib/mu_w": 0.8717080291970803,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.4396960000000001,
"calib/std_conf": 0.11289492629874914,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5273900862068965,
"calib/step_q_c_n": 464.0,
"calib/step_q_gap": 0.033377313097652905,
"calib/step_q_w": 0.49401277310924363,
"calib/step_q_w_n": 595.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2902.0,
"completions/max_terminated_length": 2902.0,
"completions/mean_length": 507.69921875,
"completions/mean_terminated_length": 513.7193603515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.0736,
"grad_norm": 0.03430643677711487,
"kl": 0.05869293212890625,
"learning_rate": 3.638888888888889e-06,
"loss": -0.1387,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03412723168730736,
"mask/share_reasoning": 0.8615804314613342,
"mask/share_step_conf": 0.0925736129283905,
"num_tokens": 16321539.0,
"reward": 1.1938366889953613,
"reward_std": 0.3026812672615051,
"rewards/accuracy_reward_step": 0.44140625,
"rewards/final_brier_reward_step": 0.5487052202224731,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.7792496681213379,
"step": 69
},
{
"adv/mean_abs_final_conf": 0.7933931350708008,
"adv/mean_abs_reasoning": 0.4543910026550293,
"adv/mean_abs_step_conf": 0.7425044775009155,
"adv/ratio_final_to_reasoning": 1.7460581975324447,
"adv/ratio_step_to_reasoning": 1.6340650962770495,
"adv/std_final_conf": 0.925523579120636,
"adv/std_reasoning": 0.7205556035041809,
"adv/std_step_conf": 0.9361271858215332,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7422693750808642,
"calib/avg_num_step_conf": 4.26953125,
"calib/ece": 0.4013895582329316,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6104417670682731,
"calib/gap": 0.1102944753525682,
"calib/mean_conf": 0.8707871485943776,
"calib/mu_c": 0.9288135593220338,
"calib/mu_w": 0.8185190839694656,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.39914056224899586,
"calib/std_conf": 0.16316327987356224,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5501617107942974,
"calib/step_q_c_n": 491.0,
"calib/step_q_gap": 0.09954543172453006,
"calib/step_q_w": 0.45061627906976737,
"calib/step_q_w_n": 602.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2272.0,
"completions/max_terminated_length": 2272.0,
"completions/mean_length": 477.78125,
"completions/mean_terminated_length": 479.6549377441406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 107.0,
"epoch": 0.07466666666666667,
"grad_norm": 0.04866304621100426,
"kl": 0.065460205078125,
"learning_rate": 3.6111111111111115e-06,
"loss": -0.0177,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.0366334542632103,
"mask/share_reasoning": 0.8547972440719604,
"mask/share_step_conf": 0.10466301441192627,
"num_tokens": 16550843.0,
"reward": 1.241321325302124,
"reward_std": 0.2550710141658783,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.6034541130065918,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.7966254353523254,
"step": 70
},
{
"adv/mean_abs_final_conf": 0.7791687250137329,
"adv/mean_abs_reasoning": 0.6073622703552246,
"adv/mean_abs_step_conf": 0.7619677186012268,
"adv/ratio_final_to_reasoning": 1.2828731105704423,
"adv/ratio_step_to_reasoning": 1.2545522759515157,
"adv/std_final_conf": 0.9324753284454346,
"adv/std_reasoning": 0.8266918659210205,
"adv/std_step_conf": 0.9363065361976624,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5827046035805626,
"calib/avg_num_step_conf": 4.828125,
"calib/ece": 0.35780478087649403,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6215139442231076,
"calib/gap": 0.012531713554987278,
"calib/mean_conf": 0.8823466135458167,
"calib/mu_c": 0.8880882352941176,
"calib/mu_w": 0.8755565217391303,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.3491593625498008,
"calib/std_conf": 0.12245338725525283,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4922910743801653,
"calib/step_q_c_n": 605.0,
"calib/step_q_gap": 0.04487443412659947,
"calib/step_q_w": 0.4474166402535658,
"calib/step_q_w_n": 631.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1548.0,
"completions/max_terminated_length": 1548.0,
"completions/mean_length": 443.1484375,
"completions/mean_terminated_length": 448.4031677246094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.07573333333333333,
"grad_norm": 0.03445587679743767,
"kl": 0.0860137939453125,
"learning_rate": 3.5833333333333335e-06,
"loss": -0.1582,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03762516379356384,
"mask/share_reasoning": 0.8341405987739563,
"mask/share_step_conf": 0.11651550978422165,
"num_tokens": 16768697.0,
"reward": 1.251643180847168,
"reward_std": 0.3011441230773926,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.6108843088150024,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7954198122024536,
"step": 71
},
{
"adv/mean_abs_final_conf": 0.7744049429893494,
"adv/mean_abs_reasoning": 0.5842318534851074,
"adv/mean_abs_step_conf": 0.7654081583023071,
"adv/ratio_final_to_reasoning": 1.3255096215137978,
"adv/ratio_step_to_reasoning": 1.3101102819649975,
"adv/std_final_conf": 0.9310498833656311,
"adv/std_reasoning": 0.8099108338356018,
"adv/std_step_conf": 0.9359899163246155,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6381246490735542,
"calib/avg_num_step_conf": 4.42578125,
"calib/ece": 0.34905511811023626,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6141732283464567,
"calib/gap": 0.06327905670971357,
"calib/mean_conf": 0.8756692913385827,
"calib/mu_c": 0.9048175182481751,
"calib/mu_w": 0.8415384615384616,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3426771653543308,
"calib/std_conf": 0.1406961911028027,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5198737322893363,
"calib/step_q_c_n": 596.0,
"calib/step_q_gap": 0.06832252186103088,
"calib/step_q_w": 0.4515512104283054,
"calib/step_q_w_n": 537.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2565.0,
"completions/max_terminated_length": 2565.0,
"completions/mean_length": 444.05078125,
"completions/mean_terminated_length": 444.05078125,
"completions/min_length": 145.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.0768,
"grad_norm": 0.04978213459253311,
"kl": 0.06729888916015625,
"learning_rate": 3.555555555555556e-06,
"loss": -0.0508,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03684322535991669,
"mask/share_reasoning": 0.8544652462005615,
"mask/share_step_conf": 0.10869147628545761,
"num_tokens": 16986782.0,
"reward": 1.319218635559082,
"reward_std": 0.25763240456581116,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6421718597412109,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8457888960838318,
"step": 72
},
{
"adv/mean_abs_final_conf": 0.7550079226493835,
"adv/mean_abs_reasoning": 0.4658733606338501,
"adv/mean_abs_step_conf": 0.7642512321472168,
"adv/ratio_final_to_reasoning": 1.620629094615214,
"adv/ratio_step_to_reasoning": 1.6404699146295998,
"adv/std_final_conf": 0.9236007928848267,
"adv/std_reasoning": 0.7205851674079895,
"adv/std_step_conf": 0.9357803463935852,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7105079365079365,
"calib/avg_num_step_conf": 4.265625,
"calib/ece": 0.2807015686274509,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.596078431372549,
"calib/gap": 0.10843876190476176,
"calib/mean_conf": 0.8623494117647059,
"calib/mu_c": 0.9070006666666666,
"calib/mu_w": 0.7985619047619048,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2774078431372548,
"calib/std_conf": 0.16312133673941612,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5269798657718121,
"calib/step_q_c_n": 596.0,
"calib/step_q_gap": 0.059091156094392694,
"calib/step_q_w": 0.4678887096774194,
"calib/step_q_w_n": 496.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2474.0,
"completions/max_terminated_length": 2474.0,
"completions/mean_length": 429.98046875,
"completions/mean_terminated_length": 429.98046875,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.07786666666666667,
"grad_norm": 0.047100406140089035,
"kl": 0.06757354736328125,
"learning_rate": 3.5277777777777784e-06,
"loss": -0.0143,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03696557879447937,
"mask/share_reasoning": 0.855215311050415,
"mask/share_step_conf": 0.10781913995742798,
"num_tokens": 17203889.0,
"reward": 1.321640968322754,
"reward_std": 0.21188318729400635,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.7058013677597046,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8105372190475464,
"step": 73
},
{
"adv/mean_abs_final_conf": 0.7607376575469971,
"adv/mean_abs_reasoning": 0.482050359249115,
"adv/mean_abs_step_conf": 0.7662625908851624,
"adv/ratio_final_to_reasoning": 1.5781290127695176,
"adv/ratio_step_to_reasoning": 1.589590332592557,
"adv/std_final_conf": 0.930891215801239,
"adv/std_reasoning": 0.7394124269485474,
"adv/std_step_conf": 0.9361252784729004,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6418861788617887,
"calib/avg_num_step_conf": 3.8671875,
"calib/ece": 0.3153104838709677,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.41935483870967744,
"calib/gap": 0.07578308943089429,
"calib/mean_conf": 0.7997459677419354,
"calib/mu_c": 0.8379430894308944,
"calib/mu_w": 0.7621600000000001,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.3095443548387097,
"calib/std_conf": 0.19638397415858946,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.48399766355140184,
"calib/step_q_c_n": 428.0,
"calib/step_q_gap": 0.05058823294641962,
"calib/step_q_w": 0.4334094306049822,
"calib/step_q_w_n": 562.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1603.0,
"completions/max_terminated_length": 1603.0,
"completions/mean_length": 415.4609375,
"completions/mean_terminated_length": 420.3873596191406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.07893333333333333,
"grad_norm": 0.039140306413173676,
"kl": 0.0781707763671875,
"learning_rate": 3.5e-06,
"loss": -0.1344,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.04051174968481064,
"mask/share_reasoning": 0.8400492668151855,
"mask/share_step_conf": 0.10772022604942322,
"num_tokens": 17414175.0,
"reward": 1.267834186553955,
"reward_std": 0.26047077775001526,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.6346431970596313,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8059812784194946,
"step": 74
},
{
"adv/mean_abs_final_conf": 0.7445878982543945,
"adv/mean_abs_reasoning": 0.4395519495010376,
"adv/mean_abs_step_conf": 0.7424792051315308,
"adv/ratio_final_to_reasoning": 1.6939701873683461,
"adv/ratio_step_to_reasoning": 1.68917281785319,
"adv/std_final_conf": 0.9286428093910217,
"adv/std_reasoning": 0.7013537883758545,
"adv/std_step_conf": 0.9361192584037781,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.77823029796714,
"calib/avg_num_step_conf": 3.9296875,
"calib/ece": 0.17942352941176468,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.6,
"calib/gap": 0.19322034252297393,
"calib/mean_conf": 0.840678431372549,
"calib/mu_c": 0.9043274853801169,
"calib/mu_w": 0.7111071428571429,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.174756862745098,
"calib/std_conf": 0.19853079606392227,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5372239263803681,
"calib/step_q_c_n": 652.0,
"calib/step_q_gap": 0.14038211847076365,
"calib/step_q_w": 0.39684180790960444,
"calib/step_q_w_n": 354.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 844.0,
"completions/max_terminated_length": 844.0,
"completions/mean_length": 386.0234375,
"completions/mean_terminated_length": 387.53729248046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.08,
"grad_norm": 0.03852907568216324,
"kl": 0.08367156982421875,
"learning_rate": 3.4722222222222224e-06,
"loss": -0.057,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.04084809869527817,
"mask/share_reasoning": 0.8442814350128174,
"mask/share_step_conf": 0.11096422374248505,
"num_tokens": 17617749.0,
"reward": 1.4051578044891357,
"reward_std": 0.22026976943016052,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.7908572554588318,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8437135219573975,
"step": 75
},
{
"adv/mean_abs_final_conf": 0.7429937124252319,
"adv/mean_abs_reasoning": 0.48358088731765747,
"adv/mean_abs_step_conf": 0.7725791931152344,
"adv/ratio_final_to_reasoning": 1.5364414349510258,
"adv/ratio_step_to_reasoning": 1.5976214390949202,
"adv/std_final_conf": 0.912419319152832,
"adv/std_reasoning": 0.7574656009674072,
"adv/std_step_conf": 0.9359506368637085,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7136080148619958,
"calib/avg_num_step_conf": 4.09375,
"calib/ece": 0.17936758893280635,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.4031620553359684,
"calib/gap": 0.14742104564755842,
"calib/mean_conf": 0.7776284584980238,
"calib/mu_c": 0.8335668789808918,
"calib/mu_w": 0.6861458333333333,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.16822134387351778,
"calib/std_conf": 0.22366668696544437,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.45767772511848337,
"calib/step_q_c_n": 633.0,
"calib/step_q_gap": 0.06006808656426649,
"calib/step_q_w": 0.3976096385542169,
"calib/step_q_w_n": 415.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1855.0,
"completions/max_terminated_length": 1855.0,
"completions/mean_length": 425.8515625,
"completions/mean_terminated_length": 425.8515625,
"completions/min_length": 161.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.08106666666666666,
"grad_norm": 0.042207323014736176,
"kl": 0.0843963623046875,
"learning_rate": 3.444444444444445e-06,
"loss": -0.0015,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.040179964154958725,
"mask/share_reasoning": 0.8500825762748718,
"mask/share_step_conf": 0.10973748564720154,
"num_tokens": 17829823.0,
"reward": 1.3595410585403442,
"reward_std": 0.20747733116149902,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7503616809844971,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8242039680480957,
"step": 76
},
{
"adv/mean_abs_final_conf": 0.7458382844924927,
"adv/mean_abs_reasoning": 0.41478484869003296,
"adv/mean_abs_step_conf": 0.7500584125518799,
"adv/ratio_final_to_reasoning": 1.7981329039572866,
"adv/ratio_step_to_reasoning": 1.808307161943602,
"adv/std_final_conf": 0.935461699962616,
"adv/std_reasoning": 0.6613828539848328,
"adv/std_step_conf": 0.9361003637313843,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5812613312613313,
"calib/avg_num_step_conf": 4.06640625,
"calib/ece": 0.192113725490196,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.3803921568627451,
"calib/gap": 0.06619327894327887,
"calib/mean_conf": 0.7518078431372549,
"calib/mu_c": 0.7775064102564102,
"calib/mu_w": 0.7113131313131313,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.16607843137254896,
"calib/std_conf": 0.2309809226000101,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4324588414634146,
"calib/step_q_c_n": 656.0,
"calib/step_q_gap": -0.027330768926195792,
"calib/step_q_w": 0.4597896103896104,
"calib/step_q_w_n": 385.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1699.0,
"completions/max_terminated_length": 1699.0,
"completions/mean_length": 401.2734375,
"completions/mean_terminated_length": 402.8470764160156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.08213333333333334,
"grad_norm": 0.0602923147380352,
"kl": 0.096343994140625,
"learning_rate": 3.416666666666667e-06,
"loss": -0.0796,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.04241577535867691,
"mask/share_reasoning": 0.836897611618042,
"mask/share_step_conf": 0.1167803704738617,
"num_tokens": 18037213.0,
"reward": 1.3238272666931152,
"reward_std": 0.21843001246452332,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7147822380065918,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8062798976898193,
"step": 77
},
{
"adv/mean_abs_final_conf": 0.7273539900779724,
"adv/mean_abs_reasoning": 0.5718014240264893,
"adv/mean_abs_step_conf": 0.7482078075408936,
"adv/ratio_final_to_reasoning": 1.2720394869885407,
"adv/ratio_step_to_reasoning": 1.3085098709132144,
"adv/std_final_conf": 0.9153119921684265,
"adv/std_reasoning": 0.792800784111023,
"adv/std_step_conf": 0.935762345790863,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7362155388471179,
"calib/avg_num_step_conf": 4.5390625,
"calib/ece": 0.20717559055118112,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.3937007874015748,
"calib/gap": 0.1877954887218044,
"calib/mean_conf": 0.7583566929133858,
"calib/mu_c": 0.8426428571428571,
"calib/mu_w": 0.6548473684210527,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.20717559055118112,
"calib/std_conf": 0.2278602352011284,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.42668166409861324,
"calib/step_q_c_n": 649.0,
"calib/step_q_gap": 0.06647562121362294,
"calib/step_q_w": 0.3602060428849903,
"calib/step_q_w_n": 513.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1532.0,
"completions/max_terminated_length": 1532.0,
"completions/mean_length": 461.93359375,
"completions/mean_terminated_length": 463.7451171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.0832,
"grad_norm": 0.05932401493191719,
"kl": 0.0837554931640625,
"learning_rate": 3.3888888888888893e-06,
"loss": -0.0694,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03584552928805351,
"mask/share_reasoning": 0.8557673096656799,
"mask/share_step_conf": 0.10448087751865387,
"num_tokens": 18263492.0,
"reward": 1.3682516813278198,
"reward_std": 0.18205977976322174,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.7448265552520752,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8419320583343506,
"step": 78
},
{
"adv/mean_abs_final_conf": 0.7352473735809326,
"adv/mean_abs_reasoning": 0.4633867144584656,
"adv/mean_abs_step_conf": 0.7579057216644287,
"adv/ratio_final_to_reasoning": 1.5866820317457213,
"adv/ratio_step_to_reasoning": 1.6355793077713745,
"adv/std_final_conf": 0.9174332022666931,
"adv/std_reasoning": 0.7206430435180664,
"adv/std_step_conf": 0.9358471632003784,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5985389610389611,
"calib/avg_num_step_conf": 4.421875,
"calib/ece": 0.2583897637795275,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.515748031496063,
"calib/gap": 0.07205493506493521,
"calib/mean_conf": 0.8216968503937009,
"calib/mu_c": 0.8500649350649352,
"calib/mu_w": 0.77801,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.23689370078740152,
"calib/std_conf": 0.21910718674068766,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.43175787965616047,
"calib/step_q_c_n": 698.0,
"calib/step_q_gap": 0.03660234970224341,
"calib/step_q_w": 0.39515552995391706,
"calib/step_q_w_n": 434.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 968.0,
"completions/max_terminated_length": 968.0,
"completions/mean_length": 435.79296875,
"completions/mean_terminated_length": 437.5019836425781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.08426666666666667,
"grad_norm": 0.037078507244586945,
"kl": 0.08056640625,
"learning_rate": 3.3611111111111117e-06,
"loss": -0.0305,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.036293767392635345,
"mask/share_reasoning": 0.8540657758712769,
"mask/share_step_conf": 0.10573424398899078,
"num_tokens": 18481431.0,
"reward": 1.331152081489563,
"reward_std": 0.23628367483615875,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6958156824111938,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.824259877204895,
"step": 79
},
{
"adv/mean_abs_final_conf": 0.6860350370407104,
"adv/mean_abs_reasoning": 0.47583746910095215,
"adv/mean_abs_step_conf": 0.7645847797393799,
"adv/ratio_final_to_reasoning": 1.4417423628637438,
"adv/ratio_step_to_reasoning": 1.6068191964453478,
"adv/std_final_conf": 0.8816706538200378,
"adv/std_reasoning": 0.7206332087516785,
"adv/std_step_conf": 0.9358549118041992,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6925,
"calib/avg_num_step_conf": 4.5703125,
"calib/ece": 0.24196078431372542,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.6196078431372549,
"calib/gap": 0.14407236842105242,
"calib/mean_conf": 0.8594509803921568,
"calib/mu_c": 0.913125,
"calib/mu_w": 0.7690526315789475,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.23698039215686267,
"calib/std_conf": 0.21613469962361845,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4521757345491389,
"calib/step_q_c_n": 658.0,
"calib/step_q_gap": 0.08987951059080557,
"calib/step_q_w": 0.36229622395833333,
"calib/step_q_w_n": 512.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2606.0,
"completions/max_terminated_length": 2606.0,
"completions/mean_length": 415.578125,
"completions/mean_terminated_length": 415.578125,
"completions/min_length": 139.0,
"completions/min_terminated_length": 139.0,
"epoch": 0.08533333333333333,
"grad_norm": 10.169572830200195,
"kl": 62.33721923828125,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.3504,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.04010055959224701,
"mask/share_reasoning": 0.8437327742576599,
"mask/share_step_conf": 0.11616663634777069,
"num_tokens": 18689979.0,
"reward": 1.3634624481201172,
"reward_std": 0.21763771772384644,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7301976680755615,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8362541794776917,
"step": 80
},
{
"adv/mean_abs_final_conf": 0.6958885788917542,
"adv/mean_abs_reasoning": 0.4900428056716919,
"adv/mean_abs_step_conf": 0.7568225860595703,
"adv/ratio_final_to_reasoning": 1.4200567192041798,
"adv/ratio_step_to_reasoning": 1.5444009733439688,
"adv/std_final_conf": 0.8862650394439697,
"adv/std_reasoning": 0.7392630577087402,
"adv/std_step_conf": 0.9359573721885681,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6334706959706959,
"calib/avg_num_step_conf": 4.328125,
"calib/ece": 0.2452191601049869,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5669291338582677,
"calib/gap": 0.11300292168149328,
"calib/mean_conf": 0.8070564304461942,
"calib/mu_c": 0.850655982905983,
"calib/mu_w": 0.7376530612244897,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2190511811023622,
"calib/std_conf": 0.25818957283509064,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.44979445407279034,
"calib/step_q_c_n": 577.0,
"calib/step_q_gap": 0.10520500021215001,
"calib/step_q_w": 0.3445894538606403,
"calib/step_q_w_n": 531.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2342.0,
"completions/max_terminated_length": 2342.0,
"completions/mean_length": 438.79296875,
"completions/mean_terminated_length": 440.5137634277344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.0864,
"grad_norm": 0.04086478427052498,
"kl": 0.099609375,
"learning_rate": 3.3055555555555558e-06,
"loss": -0.0251,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04027840867638588,
"mask/share_reasoning": 0.8512026071548462,
"mask/share_step_conf": 0.10461273789405823,
"num_tokens": 18908558.0,
"reward": 1.3497854471206665,
"reward_std": 0.2123834192752838,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7071569561958313,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8360507488250732,
"step": 81
},
{
"adv/mean_abs_final_conf": 0.7277892827987671,
"adv/mean_abs_reasoning": 0.5591062903404236,
"adv/mean_abs_step_conf": 0.7847660779953003,
"adv/ratio_final_to_reasoning": 1.3017011172520296,
"adv/ratio_step_to_reasoning": 1.4036080286585204,
"adv/std_final_conf": 0.9133651852607727,
"adv/std_reasoning": 0.7927923798561096,
"adv/std_step_conf": 0.9358444809913635,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.668380376344086,
"calib/avg_num_step_conf": 4.47265625,
"calib/ece": 0.2806442687747036,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.7351778656126482,
"calib/gap": 0.1358327284946237,
"calib/mean_conf": 0.8799881422924901,
"calib/mu_c": 0.92991875,
"calib/mu_w": 0.7940860215053763,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.26411067193675897,
"calib/std_conf": 0.22479581393380432,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4877527047913447,
"calib/step_q_c_n": 647.0,
"calib/step_q_gap": 0.13897298591584273,
"calib/step_q_w": 0.34877971887550197,
"calib/step_q_w_n": 498.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2582.0,
"completions/max_terminated_length": 2582.0,
"completions/mean_length": 377.25,
"completions/mean_terminated_length": 380.220458984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 99.0,
"epoch": 0.08746666666666666,
"grad_norm": 0.04228287935256958,
"kl": 0.1171722412109375,
"learning_rate": 3.277777777777778e-06,
"loss": -0.0091,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.04324019327759743,
"mask/share_reasoning": 0.8272990584373474,
"mask/share_step_conf": 0.12164826691150665,
"num_tokens": 19110686.0,
"reward": 1.3558624982833862,
"reward_std": 0.23602716624736786,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7104343771934509,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8393171429634094,
"step": 82
},
{
"adv/mean_abs_final_conf": 0.6465597152709961,
"adv/mean_abs_reasoning": 0.3706515431404114,
"adv/mean_abs_step_conf": 0.7619996070861816,
"adv/ratio_final_to_reasoning": 1.744386951131792,
"adv/ratio_step_to_reasoning": 2.055838215672877,
"adv/std_final_conf": 0.8661852478981018,
"adv/std_reasoning": 0.640232264995575,
"adv/std_step_conf": 0.9360112547874451,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5799475753604194,
"calib/avg_num_step_conf": 3.99609375,
"calib/ece": 0.3164062500000001,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.6484375,
"calib/gap": 0.08320414404293819,
"calib/mean_conf": 0.841171875,
"calib/mu_c": 0.8765986394557824,
"calib/mu_w": 0.7933944954128442,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.29167968750000006,
"calib/std_conf": 0.24750985174126783,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4914368453481207,
"calib/step_q_c_n": 541.0,
"calib/step_q_gap": 0.10662564202861863,
"calib/step_q_w": 0.3848112033195021,
"calib/step_q_w_n": 482.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1184.0,
"completions/max_terminated_length": 1184.0,
"completions/mean_length": 425.54296875,
"completions/mean_terminated_length": 427.2117919921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.08853333333333334,
"grad_norm": 0.038990918546915054,
"kl": 0.1039276123046875,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.1183,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.040513087064027786,
"mask/share_reasoning": 0.8545513153076172,
"mask/share_step_conf": 0.10102932155132294,
"num_tokens": 19326889.0,
"reward": 1.333298683166504,
"reward_std": 0.19381004571914673,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.6636687517166138,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.8440423607826233,
"step": 83
},
{
"adv/mean_abs_final_conf": 0.7489799857139587,
"adv/mean_abs_reasoning": 0.6053099632263184,
"adv/mean_abs_step_conf": 0.7614016532897949,
"adv/ratio_final_to_reasoning": 1.237349508872901,
"adv/ratio_step_to_reasoning": 1.2578706770850154,
"adv/std_final_conf": 0.9016205072402954,
"adv/std_reasoning": 0.8264942169189453,
"adv/std_step_conf": 0.9361007213592529,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.713712686567164,
"calib/avg_num_step_conf": 4.140625,
"calib/ece": 0.33639107611548535,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.6889763779527559,
"calib/gap": 0.1905932835820896,
"calib/mean_conf": 0.8579658792650918,
"calib/mu_c": 0.9480099502487562,
"calib/mu_w": 0.7574166666666666,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3333989501312334,
"calib/std_conf": 0.24192491526825316,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5059726666666666,
"calib/step_q_c_n": 500.0,
"calib/step_q_gap": 0.11430445238095233,
"calib/step_q_w": 0.3916682142857143,
"calib/step_q_w_n": 560.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3060.0,
"completions/max_terminated_length": 3060.0,
"completions/mean_length": 406.8671875,
"completions/mean_terminated_length": 406.8671875,
"completions/min_length": 121.0,
"completions/min_terminated_length": 121.0,
"epoch": 0.0896,
"grad_norm": 0.03670133650302887,
"kl": 0.12044525146484375,
"learning_rate": 3.2222222222222227e-06,
"loss": 0.0093,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04312140494585037,
"mask/share_reasoning": 0.8456448912620544,
"mask/share_step_conf": 0.11123368889093399,
"num_tokens": 19536967.0,
"reward": 1.3182759284973145,
"reward_std": 0.2619035840034485,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.6727728843688965,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8303269147872925,
"step": 84
},
{
"adv/mean_abs_final_conf": 0.6797654628753662,
"adv/mean_abs_reasoning": 0.46334075927734375,
"adv/mean_abs_step_conf": 0.7665224671363831,
"adv/ratio_final_to_reasoning": 1.4670961905781232,
"adv/ratio_step_to_reasoning": 1.6543385225420295,
"adv/std_final_conf": 0.8873353004455566,
"adv/std_reasoning": 0.7391656041145325,
"adv/std_step_conf": 0.9362779259681702,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6875468867216804,
"calib/avg_num_step_conf": 3.89453125,
"calib/ece": 0.3937022397891964,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.7233201581027668,
"calib/gap": 0.17096878386263237,
"calib/mean_conf": 0.8838208168642951,
"calib/mu_c": 0.9709946236559142,
"calib/mu_w": 0.8000258397932818,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3937022397891964,
"calib/std_conf": 0.21735560772120124,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5151022727272727,
"calib/step_q_c_n": 440.0,
"calib/step_q_gap": 0.10504302676676996,
"calib/step_q_w": 0.41005924596050275,
"calib/step_q_w_n": 557.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2424.0,
"completions/max_terminated_length": 2424.0,
"completions/mean_length": 429.99609375,
"completions/mean_terminated_length": 431.682373046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.09066666666666667,
"grad_norm": 0.04405707120895386,
"kl": 0.113250732421875,
"learning_rate": 3.1944444444444443e-06,
"loss": -0.0879,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.04101819917559624,
"mask/share_reasoning": 0.850917637348175,
"mask/share_step_conf": 0.10415787994861603,
"num_tokens": 19754870.0,
"reward": 1.2764089107513428,
"reward_std": 0.2609490156173706,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.6228024959564209,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8181326389312744,
"step": 85
},
{
"adv/mean_abs_final_conf": 0.7362145185470581,
"adv/mean_abs_reasoning": 0.5997448563575745,
"adv/mean_abs_step_conf": 0.7307212948799133,
"adv/ratio_final_to_reasoning": 1.2275461985923544,
"adv/ratio_step_to_reasoning": 1.2183869309322584,
"adv/std_final_conf": 0.9218765497207642,
"adv/std_reasoning": 0.8427642583847046,
"adv/std_step_conf": 0.9363045692443848,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6512355333124804,
"calib/avg_num_step_conf": 3.84375,
"calib/ece": 0.3785074803149607,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6811023622047244,
"calib/gap": 0.1717838411010323,
"calib/mean_conf": 0.8218862204724409,
"calib/mu_c": 0.9158939130434782,
"calib/mu_w": 0.7441100719424459,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.37381889763779536,
"calib/std_conf": 0.28613991430444674,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5133126168224299,
"calib/step_q_c_n": 428.0,
"calib/step_q_gap": 0.09592358804545142,
"calib/step_q_w": 0.4173890287769785,
"calib/step_q_w_n": 556.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1843.0,
"completions/max_terminated_length": 1843.0,
"completions/mean_length": 413.6953125,
"completions/mean_terminated_length": 415.3176574707031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 108.0,
"epoch": 0.09173333333333333,
"grad_norm": 0.059901315718889236,
"kl": 0.1295318603515625,
"learning_rate": 3.1666666666666667e-06,
"loss": -0.0866,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04301995784044266,
"mask/share_reasoning": 0.8469830751419067,
"mask/share_step_conf": 0.1060907393693924,
"num_tokens": 19966288.0,
"reward": 1.2757947444915771,
"reward_std": 0.3126189708709717,
"rewards/accuracy_reward_step": 0.44921875,
"rewards/final_brier_reward_step": 0.6065736413002014,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8291484117507935,
"step": 86
},
{
"adv/mean_abs_final_conf": 0.7256042957305908,
"adv/mean_abs_reasoning": 0.48823797702789307,
"adv/mean_abs_step_conf": 0.7334685921669006,
"adv/ratio_final_to_reasoning": 1.486169306508365,
"adv/ratio_step_to_reasoning": 1.5022768131062396,
"adv/std_final_conf": 0.9046579003334045,
"adv/std_reasoning": 0.7576124668121338,
"adv/std_step_conf": 0.9361890554428101,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6175992192582953,
"calib/avg_num_step_conf": 3.4921875,
"calib/ece": 0.33095617529880467,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.7529880478087649,
"calib/gap": 0.10410865322055973,
"calib/mean_conf": 0.8994820717131474,
"calib/mu_c": 0.943448275862069,
"calib/mu_w": 0.8393396226415093,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3263745019920318,
"calib/std_conf": 0.19804455079826683,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5461728155339806,
"calib/step_q_c_n": 515.0,
"calib/step_q_gap": 0.0419955068268566,
"calib/step_q_w": 0.504177308707124,
"calib/step_q_w_n": 379.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2846.0,
"completions/max_terminated_length": 2846.0,
"completions/mean_length": 374.390625,
"completions/mean_terminated_length": 377.3385925292969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 87.0,
"epoch": 0.0928,
"grad_norm": 0.0383741669356823,
"kl": 0.1442108154296875,
"learning_rate": 3.138888888888889e-06,
"loss": -0.0843,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.04641494154930115,
"mask/share_reasoning": 0.8423793315887451,
"mask/share_step_conf": 0.10339324176311493,
"num_tokens": 20167628.0,
"reward": 1.2959322929382324,
"reward_std": 0.28941088914871216,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6510910391807556,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8156992197036743,
"step": 87
},
{
"adv/mean_abs_final_conf": 0.6736510396003723,
"adv/mean_abs_reasoning": 0.5277516841888428,
"adv/mean_abs_step_conf": 0.7695959806442261,
"adv/ratio_final_to_reasoning": 1.2764545519845714,
"adv/ratio_step_to_reasoning": 1.458253954844501,
"adv/std_final_conf": 0.8730496168136597,
"adv/std_reasoning": 0.75759357213974,
"adv/std_step_conf": 0.9361079335212708,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7334702907711756,
"calib/avg_num_step_conf": 3.828125,
"calib/ece": 0.2642371541501976,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.616600790513834,
"calib/gap": 0.21584070796460175,
"calib/mean_conf": 0.8175968379446641,
"calib/mu_c": 0.9139999999999999,
"calib/mu_w": 0.6981592920353982,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2642371541501976,
"calib/std_conf": 0.2569745137196758,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.49471593090211136,
"calib/step_q_c_n": 521.0,
"calib/step_q_gap": 0.06141222719840761,
"calib/step_q_w": 0.43330370370370375,
"calib/step_q_w_n": 459.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1968.0,
"completions/max_terminated_length": 1968.0,
"completions/mean_length": 426.65625,
"completions/mean_terminated_length": 428.3294372558594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.09386666666666667,
"grad_norm": 0.028476638719439507,
"kl": 0.191864013671875,
"learning_rate": 3.1111111111111116e-06,
"loss": 0.0005,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.039456646889448166,
"mask/share_reasoning": 0.8581996560096741,
"mask/share_step_conf": 0.09843742847442627,
"num_tokens": 20386700.0,
"reward": 1.3378771543502808,
"reward_std": 0.25673359632492065,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.7152007222175598,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8271518349647522,
"step": 88
},
{
"adv/mean_abs_final_conf": 0.6247692704200745,
"adv/mean_abs_reasoning": 0.36448341608047485,
"adv/mean_abs_step_conf": 0.7542611956596375,
"adv/ratio_final_to_reasoning": 1.7141226263148572,
"adv/ratio_step_to_reasoning": 2.0693978446830155,
"adv/std_final_conf": 0.8439804315567017,
"adv/std_reasoning": 0.6612530946731567,
"adv/std_step_conf": 0.9361677169799805,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7321204122674712,
"calib/avg_num_step_conf": 3.484375,
"calib/ece": 0.33624505928853765,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5454545454545454,
"calib/gap": 0.21677369281045744,
"calib/mean_conf": 0.786695652173913,
"calib/mu_c": 0.9032222222222221,
"calib/mu_w": 0.6864485294117647,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.33024505928853765,
"calib/std_conf": 0.26675508422771166,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5419005376344086,
"calib/step_q_c_n": 372.0,
"calib/step_q_gap": 0.10425822994210088,
"calib/step_q_w": 0.4376423076923077,
"calib/step_q_w_n": 520.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2029.0,
"completions/max_terminated_length": 2029.0,
"completions/mean_length": 409.45703125,
"completions/mean_terminated_length": 409.45703125,
"completions/min_length": 90.0,
"completions/min_terminated_length": 90.0,
"epoch": 0.09493333333333333,
"grad_norm": 0.06513398885726929,
"kl": 0.1344451904296875,
"learning_rate": 3.0833333333333336e-06,
"loss": -0.0754,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.04523952305316925,
"mask/share_reasoning": 0.8543978929519653,
"mask/share_step_conf": 0.10036254674196243,
"num_tokens": 20600409.0,
"reward": 1.3104181289672852,
"reward_std": 0.24107292294502258,
"rewards/accuracy_reward_step": 0.45703125,
"rewards/final_brier_reward_step": 0.6709932088851929,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8307808637619019,
"step": 89
},
{
"adv/mean_abs_final_conf": 0.7081605195999146,
"adv/mean_abs_reasoning": 0.5528960227966309,
"adv/mean_abs_step_conf": 0.7596926689147949,
"adv/ratio_final_to_reasoning": 1.2808204262673706,
"adv/ratio_step_to_reasoning": 1.3740244776443782,
"adv/std_final_conf": 0.9066979885101318,
"adv/std_reasoning": 0.7927749752998352,
"adv/std_step_conf": 0.9362542033195496,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6580491722263404,
"calib/avg_num_step_conf": 3.84375,
"calib/ece": 0.2879296875,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.58984375,
"calib/gap": 0.1431813689152459,
"calib/mean_conf": 0.8079296874999999,
"calib/mu_c": 0.8716901408450705,
"calib/mu_w": 0.7285087719298246,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2705859375,
"calib/std_conf": 0.26329494759898514,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5204888268156426,
"calib/step_q_c_n": 537.0,
"calib/step_q_gap": 0.0747061646232488,
"calib/step_q_w": 0.44578266219239376,
"calib/step_q_w_n": 447.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1112.0,
"completions/max_terminated_length": 1112.0,
"completions/mean_length": 385.65234375,
"completions/mean_terminated_length": 387.16473388671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.096,
"grad_norm": 0.030344147235155106,
"kl": 0.154571533203125,
"learning_rate": 3.055555555555556e-06,
"loss": -0.0262,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.04306305944919586,
"mask/share_reasoning": 0.8437785506248474,
"mask/share_step_conf": 0.10925211012363434,
"num_tokens": 20802456.0,
"reward": 1.3113610744476318,
"reward_std": 0.24878989160060883,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.6869570016860962,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8128043413162231,
"step": 90
},
{
"adv/mean_abs_final_conf": 0.7299604415893555,
"adv/mean_abs_reasoning": 0.520104169845581,
"adv/mean_abs_step_conf": 0.7571775913238525,
"adv/ratio_final_to_reasoning": 1.403488923778636,
"adv/ratio_step_to_reasoning": 1.4558191132146827,
"adv/std_final_conf": 0.9214328527450562,
"adv/std_reasoning": 0.7926962375640869,
"adv/std_step_conf": 0.9360818266868591,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6639589704383282,
"calib/avg_num_step_conf": 3.62109375,
"calib/ece": 0.22901185770750992,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.4505928853754941,
"calib/gap": 0.15232415902140672,
"calib/mean_conf": 0.7577075098814229,
"calib/mu_c": 0.8233333333333334,
"calib/mu_w": 0.6710091743119266,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20877470355731229,
"calib/std_conf": 0.25543012128681636,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.48703594080338264,
"calib/step_q_c_n": 473.0,
"calib/step_q_gap": 0.046277791023646986,
"calib/step_q_w": 0.44075814977973565,
"calib/step_q_w_n": 454.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1460.0,
"completions/max_terminated_length": 1460.0,
"completions/mean_length": 377.328125,
"completions/mean_terminated_length": 378.807861328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.09706666666666666,
"grad_norm": 0.03573990985751152,
"kl": 0.1767578125,
"learning_rate": 3.0277777777777776e-06,
"loss": -0.0513,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.04190374165773392,
"mask/share_reasoning": 0.8539397716522217,
"mask/share_step_conf": 0.100250244140625,
"num_tokens": 21006764.0,
"reward": 1.3354003429412842,
"reward_std": 0.236099511384964,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.720158576965332,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.820242702960968,
"step": 91
},
{
"adv/mean_abs_final_conf": 0.7487197518348694,
"adv/mean_abs_reasoning": 0.6103894114494324,
"adv/mean_abs_step_conf": 0.7408183813095093,
"adv/ratio_final_to_reasoning": 1.226626376196398,
"adv/ratio_step_to_reasoning": 1.2136815734571147,
"adv/std_final_conf": 0.9200137853622437,
"adv/std_reasoning": 0.8265608549118042,
"adv/std_step_conf": 0.9363206028938293,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6584266169154229,
"calib/avg_num_step_conf": 3.48046875,
"calib/ece": 0.23157086614173222,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.39763779527559057,
"calib/gap": 0.14667537313432855,
"calib/mean_conf": 0.7591299212598425,
"calib/mu_c": 0.8284253731343285,
"calib/mu_w": 0.68175,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.23157086614173222,
"calib/std_conf": 0.23715097613687144,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5830169491525424,
"calib/step_q_c_n": 413.0,
"calib/step_q_gap": 0.09549602865045037,
"calib/step_q_w": 0.48752092050209206,
"calib/step_q_w_n": 478.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1140.0,
"completions/max_terminated_length": 1140.0,
"completions/mean_length": 364.46484375,
"completions/mean_terminated_length": 365.8941345214844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 90.0,
"epoch": 0.09813333333333334,
"grad_norm": 0.043616894632577896,
"kl": 0.164886474609375,
"learning_rate": 3e-06,
"loss": -0.0171,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.04630187898874283,
"mask/share_reasoning": 0.8456723690032959,
"mask/share_step_conf": 0.10411947965621948,
"num_tokens": 21206787.0,
"reward": 1.3230915069580078,
"reward_std": 0.2727872133255005,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.7084305286407471,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.817704439163208,
"step": 92
},
{
"adv/mean_abs_final_conf": 0.7456616163253784,
"adv/mean_abs_reasoning": 0.6048349142074585,
"adv/mean_abs_step_conf": 0.7437169551849365,
"adv/ratio_final_to_reasoning": 1.2328349419154336,
"adv/ratio_step_to_reasoning": 1.2296197486539964,
"adv/std_final_conf": 0.9214170575141907,
"adv/std_reasoning": 0.8430580496788025,
"adv/std_step_conf": 0.9363595843315125,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6939876670092497,
"calib/avg_num_step_conf": 3.70703125,
"calib/ece": 0.2401366533864542,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.26294820717131473,
"calib/gap": 0.17083301002055495,
"calib/mean_conf": 0.6627318725099602,
"calib/mu_c": 0.7573366071428571,
"calib/mu_w": 0.5865035971223022,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.22832669322709165,
"calib/std_conf": 0.2609951507313089,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5360590260285474,
"calib/step_q_c_n": 397.0,
"calib/step_q_gap": 0.08241591008651838,
"calib/step_q_w": 0.453643115942029,
"calib/step_q_w_n": 552.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2052.0,
"completions/max_terminated_length": 2052.0,
"completions/mean_length": 419.4453125,
"completions/mean_terminated_length": 419.4453125,
"completions/min_length": 126.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.0992,
"grad_norm": 0.043325275182724,
"kl": 0.145263671875,
"learning_rate": 2.9722222222222225e-06,
"loss": -0.0145,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.04405222460627556,
"mask/share_reasoning": 0.8505038619041443,
"mask/share_step_conf": 0.10544390976428986,
"num_tokens": 21419941.0,
"reward": 1.288401484489441,
"reward_std": 0.2714729905128479,
"rewards/accuracy_reward_step": 0.4375,
"rewards/final_brier_reward_step": 0.7044404745101929,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.7951655983924866,
"step": 93
},
{
"adv/mean_abs_final_conf": 0.7081672549247742,
"adv/mean_abs_reasoning": 0.5124787092208862,
"adv/mean_abs_step_conf": 0.7548666000366211,
"adv/ratio_final_to_reasoning": 1.3818471717613212,
"adv/ratio_step_to_reasoning": 1.4729716307321987,
"adv/std_final_conf": 0.9206733703613281,
"adv/std_reasoning": 0.7575570344924927,
"adv/std_step_conf": 0.936098575592041,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7496691245982227,
"calib/avg_num_step_conf": 3.39453125,
"calib/ece": 0.15350000000000003,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.2222222222222222,
"calib/gap": 0.21518283229343915,
"calib/mean_conf": 0.6507222222222223,
"calib/mu_c": 0.755751937984496,
"calib/mu_w": 0.5405691056910569,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14615873015873018,
"calib/std_conf": 0.2485085076915109,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.540849765258216,
"calib/step_q_c_n": 426.0,
"calib/step_q_gap": 0.10967595035979621,
"calib/step_q_w": 0.4311738148984198,
"calib/step_q_w_n": 443.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2063.0,
"completions/max_terminated_length": 2063.0,
"completions/mean_length": 373.30859375,
"completions/mean_terminated_length": 374.7725830078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 101.0,
"epoch": 0.10026666666666667,
"grad_norm": 0.03856610134243965,
"kl": 0.167999267578125,
"learning_rate": 2.944444444444445e-06,
"loss": -0.0001,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.04606093466281891,
"mask/share_reasoning": 0.8470206260681152,
"mask/share_step_conf": 0.10301218926906586,
"num_tokens": 21624188.0,
"reward": 1.3532507419586182,
"reward_std": 0.20355263352394104,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.7645102739334106,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8213862180709839,
"step": 94
},
{
"adv/mean_abs_final_conf": 0.7450008392333984,
"adv/mean_abs_reasoning": 0.4954552948474884,
"adv/mean_abs_step_conf": 0.755539059638977,
"adv/ratio_final_to_reasoning": 1.5036691442821808,
"adv/ratio_step_to_reasoning": 1.524938914764344,
"adv/std_final_conf": 0.9205859303474426,
"adv/std_reasoning": 0.7393431067466736,
"adv/std_step_conf": 0.9360067844390869,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7414733542319748,
"calib/avg_num_step_conf": 3.37109375,
"calib/ece": 0.1439254901960784,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.24313725490196078,
"calib/gap": 0.20583134796238245,
"calib/mean_conf": 0.6557686274509804,
"calib/mu_c": 0.7445586206896552,
"calib/mu_w": 0.5387272727272727,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1155333333333333,
"calib/std_conf": 0.2674706166674557,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5160469083155651,
"calib/step_q_c_n": 469.0,
"calib/step_q_gap": 0.08721132455921993,
"calib/step_q_w": 0.4288355837563452,
"calib/step_q_w_n": 394.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1069.0,
"completions/max_terminated_length": 1069.0,
"completions/mean_length": 361.86328125,
"completions/mean_terminated_length": 363.2823791503906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 70.0,
"epoch": 0.10133333333333333,
"grad_norm": 0.07308512181043625,
"kl": 0.1591339111328125,
"learning_rate": 2.916666666666667e-06,
"loss": -0.0879,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.048363469541072845,
"mask/share_reasoning": 0.8412728309631348,
"mask/share_step_conf": 0.10645744204521179,
"num_tokens": 21822953.0,
"reward": 1.375627875328064,
"reward_std": 0.1974714696407318,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.7704393863677979,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8345487117767334,
"step": 95
},
{
"adv/mean_abs_final_conf": 0.7327808141708374,
"adv/mean_abs_reasoning": 0.4727819859981537,
"adv/mean_abs_step_conf": 0.7735196352005005,
"adv/ratio_final_to_reasoning": 1.5499338720018387,
"adv/ratio_step_to_reasoning": 1.6361021741710804,
"adv/std_final_conf": 0.933215856552124,
"adv/std_reasoning": 0.7206127047538757,
"adv/std_step_conf": 0.9363253116607666,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7275184599156118,
"calib/avg_num_step_conf": 3.38671875,
"calib/ece": 0.08228346456692916,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.2283464566929134,
"calib/gap": 0.2073114451476793,
"calib/mean_conf": 0.6432283464566929,
"calib/mu_c": 0.7215822784810126,
"calib/mu_w": 0.5142708333333333,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.05173228346456696,
"calib/std_conf": 0.26140860672311333,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5292725409836065,
"calib/step_q_c_n": 488.0,
"calib/step_q_gap": 0.10668177581210259,
"calib/step_q_w": 0.42259076517150396,
"calib/step_q_w_n": 379.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2468.0,
"completions/max_terminated_length": 2468.0,
"completions/mean_length": 369.91015625,
"completions/mean_terminated_length": 369.91015625,
"completions/min_length": 106.0,
"completions/min_terminated_length": 106.0,
"epoch": 0.1024,
"grad_norm": 0.058021366596221924,
"kl": 0.1710662841796875,
"learning_rate": 2.888888888888889e-06,
"loss": 0.0321,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.047193411737680435,
"mask/share_reasoning": 0.8507916927337646,
"mask/share_step_conf": 0.10201486945152283,
"num_tokens": 22023466.0,
"reward": 1.3637802600860596,
"reward_std": 0.21904903650283813,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.7867922186851501,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8098371624946594,
"step": 96
},
{
"adv/mean_abs_final_conf": 0.7595298290252686,
"adv/mean_abs_reasoning": 0.5360906720161438,
"adv/mean_abs_step_conf": 0.7516697645187378,
"adv/ratio_final_to_reasoning": 1.4167935923391632,
"adv/ratio_step_to_reasoning": 1.4021317731417309,
"adv/std_final_conf": 0.935832679271698,
"adv/std_reasoning": 0.792716920375824,
"adv/std_step_conf": 0.9358479976654053,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7143928035982009,
"calib/avg_num_step_conf": 3.69140625,
"calib/ece": 0.10116929133858268,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.14960629921259844,
"calib/gap": 0.18254960019990008,
"calib/mean_conf": 0.5960511811023622,
"calib/mu_c": 0.6794202898550724,
"calib/mu_w": 0.49687068965517234,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.07695669291338585,
"calib/std_conf": 0.24954816124762283,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4945020746887966,
"calib/step_q_c_n": 482.0,
"calib/step_q_gap": 0.07403490406244662,
"calib/step_q_w": 0.42046717062635,
"calib/step_q_w_n": 463.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1702.0,
"completions/max_terminated_length": 1702.0,
"completions/mean_length": 368.3046875,
"completions/mean_terminated_length": 368.3046875,
"completions/min_length": 115.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.10346666666666667,
"grad_norm": 0.03864547610282898,
"kl": 0.1785888671875,
"learning_rate": 2.861111111111111e-06,
"loss": 0.0053,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.044990718364715576,
"mask/share_reasoning": 0.8436180353164673,
"mask/share_step_conf": 0.11139123886823654,
"num_tokens": 22222824.0,
"reward": 1.3819646835327148,
"reward_std": 0.19518443942070007,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.7713358402252197,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8431717157363892,
"step": 97
},
{
"adv/mean_abs_final_conf": 0.7767525911331177,
"adv/mean_abs_reasoning": 0.5913317203521729,
"adv/mean_abs_step_conf": 0.7650718688964844,
"adv/ratio_final_to_reasoning": 1.3135648983459163,
"adv/ratio_step_to_reasoning": 1.293811650152708,
"adv/std_final_conf": 0.9360088109970093,
"adv/std_reasoning": 0.8265880346298218,
"adv/std_step_conf": 0.9362597465515137,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6197610654132394,
"calib/avg_num_step_conf": 3.4453125,
"calib/ece": 0.1542570281124498,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.21285140562248997,
"calib/gap": 0.1063337250293771,
"calib/mean_conf": 0.6276706827309237,
"calib/mu_c": 0.6750724637681159,
"calib/mu_w": 0.5687387387387388,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11385542168674696,
"calib/std_conf": 0.25834232912942434,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5326319444444445,
"calib/step_q_c_n": 480.0,
"calib/step_q_gap": 0.08416677031509129,
"calib/step_q_w": 0.4484651741293532,
"calib/step_q_w_n": 402.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1856.0,
"completions/max_terminated_length": 1856.0,
"completions/mean_length": 389.60546875,
"completions/mean_terminated_length": 391.13336181640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 83.0,
"epoch": 0.10453333333333334,
"grad_norm": 0.03718774393200874,
"kl": 0.1533050537109375,
"learning_rate": 2.8333333333333335e-06,
"loss": -0.0184,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.04543950408697128,
"mask/share_reasoning": 0.8515037298202515,
"mask/share_step_conf": 0.09915057569742203,
"num_tokens": 22428747.0,
"reward": 1.3166905641555786,
"reward_std": 0.25483328104019165,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7132925987243652,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8077004551887512,
"step": 98
},
{
"adv/mean_abs_final_conf": 0.7712830305099487,
"adv/mean_abs_reasoning": 0.6145671010017395,
"adv/mean_abs_step_conf": 0.76088547706604,
"adv/ratio_final_to_reasoning": 1.2550021458238874,
"adv/ratio_step_to_reasoning": 1.2380836459123874,
"adv/std_final_conf": 0.9343674182891846,
"adv/std_reasoning": 0.8267272710800171,
"adv/std_step_conf": 0.9363002181053162,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7107643758765779,
"calib/avg_num_step_conf": 3.86328125,
"calib/ece": 0.18951821862348178,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.1902834008097166,
"calib/gap": 0.21728204768583437,
"calib/mean_conf": 0.5402793522267206,
"calib/mu_c": 0.6766304347826085,
"calib/mu_w": 0.45934838709677417,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.17866396761133604,
"calib/std_conf": 0.28560527193146484,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4695117493472586,
"calib/step_q_c_n": 383.0,
"calib/step_q_gap": 0.08147379555187906,
"calib/step_q_w": 0.38803795379537953,
"calib/step_q_w_n": 606.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3019.0,
"completions/max_terminated_length": 3019.0,
"completions/mean_length": 477.06640625,
"completions/mean_terminated_length": 478.9372863769531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 108.0,
"epoch": 0.1056,
"grad_norm": 0.07817687094211578,
"kl": 0.1505889892578125,
"learning_rate": 2.805555555555556e-06,
"loss": -0.0125,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.039287276566028595,
"mask/share_reasoning": 0.8576635122299194,
"mask/share_step_conf": 0.09914298355579376,
"num_tokens": 22656676.0,
"reward": 1.302371621131897,
"reward_std": 0.27057546377182007,
"rewards/accuracy_reward_step": 0.359375,
"rewards/final_brier_reward_step": 0.7279237508773804,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8063784837722778,
"step": 99
},
{
"adv/mean_abs_final_conf": 0.7553995847702026,
"adv/mean_abs_reasoning": 0.5235493183135986,
"adv/mean_abs_step_conf": 0.7469733953475952,
"adv/ratio_final_to_reasoning": 1.4428432209661077,
"adv/ratio_step_to_reasoning": 1.4267488643737833,
"adv/std_final_conf": 0.9213042855262756,
"adv/std_reasoning": 0.7754214406013489,
"adv/std_step_conf": 0.9361969232559204,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.8015468901063486,
"calib/avg_num_step_conf": 3.734375,
"calib/ece": 0.2004761904761905,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.2619047619047619,
"calib/gap": 0.3163764099258783,
"calib/mean_conf": 0.619920634920635,
"calib/mu_c": 0.80196261682243,
"calib/mu_w": 0.4855862068965517,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.19789682539682543,
"calib/std_conf": 0.3019526296069981,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5622406779661017,
"calib/step_q_c_n": 413.0,
"calib/step_q_gap": 0.15584448398206235,
"calib/step_q_w": 0.4063961939840393,
"calib/step_q_w_n": 543.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2323.0,
"completions/max_terminated_length": 2323.0,
"completions/mean_length": 410.984375,
"completions/mean_terminated_length": 412.5960998535156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 91.0,
"epoch": 0.10666666666666667,
"grad_norm": 0.033727023750543594,
"kl": 0.1587677001953125,
"learning_rate": 2.7777777777777783e-06,
"loss": -0.0441,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.0431080237030983,
"mask/share_reasoning": 0.8496963977813721,
"mask/share_step_conf": 0.10328933596611023,
"num_tokens": 22869296.0,
"reward": 1.3796181678771973,
"reward_std": 0.23965272307395935,
"rewards/accuracy_reward_step": 0.41796875,
"rewards/final_brier_reward_step": 0.7687491774559021,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.855009138584137,
"step": 100
},
{
"adv/mean_abs_final_conf": 0.7755560874938965,
"adv/mean_abs_reasoning": 0.5660301446914673,
"adv/mean_abs_step_conf": 0.7645591497421265,
"adv/ratio_final_to_reasoning": 1.3701674632834935,
"adv/ratio_step_to_reasoning": 1.350739279369783,
"adv/std_final_conf": 0.9355933666229248,
"adv/std_reasoning": 0.8098204135894775,
"adv/std_step_conf": 0.936090886592865,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7312936481543065,
"calib/avg_num_step_conf": 3.8515625,
"calib/ece": 0.19984523809523808,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.1865079365079365,
"calib/gap": 0.22335370801463256,
"calib/mean_conf": 0.5784960317460318,
"calib/mu_c": 0.7158762886597938,
"calib/mu_w": 0.49252258064516125,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.19671031746031745,
"calib/std_conf": 0.2734869269098656,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5004893617021277,
"calib/step_q_c_n": 376.0,
"calib/step_q_gap": 0.07177788629229159,
"calib/step_q_w": 0.42871147540983606,
"calib/step_q_w_n": 610.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1904.0,
"completions/max_terminated_length": 1904.0,
"completions/mean_length": 426.7890625,
"completions/mean_terminated_length": 430.14959716796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 99.0,
"epoch": 0.10773333333333333,
"grad_norm": 0.03631528094410896,
"kl": 0.172576904296875,
"learning_rate": 2.7500000000000004e-06,
"loss": -0.1333,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03998691961169243,
"mask/share_reasoning": 0.8527641296386719,
"mask/share_step_conf": 0.0994364470243454,
"num_tokens": 23085546.0,
"reward": 1.312831163406372,
"reward_std": 0.25012314319610596,
"rewards/accuracy_reward_step": 0.37890625,
"rewards/final_brier_reward_step": 0.7324117422103882,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.811859667301178,
"step": 101
},
{
"adv/mean_abs_final_conf": 0.7160638570785522,
"adv/mean_abs_reasoning": 0.44066131114959717,
"adv/mean_abs_step_conf": 0.7667325139045715,
"adv/ratio_final_to_reasoning": 1.6249755514285675,
"adv/ratio_step_to_reasoning": 1.7399587722015346,
"adv/std_final_conf": 0.9027671813964844,
"adv/std_reasoning": 0.7013367414474487,
"adv/std_step_conf": 0.9360345602035522,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.8072357981593145,
"calib/avg_num_step_conf": 3.8984375,
"calib/ece": 0.14190476190476184,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.31746031746031744,
"calib/gap": 0.30249444620755317,
"calib/mean_conf": 0.6661904761904762,
"calib/mu_c": 0.8042335766423359,
"calib/mu_w": 0.5017391304347827,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13222222222222219,
"calib/std_conf": 0.2820428796141521,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5499742,
"calib/step_q_c_n": 500.0,
"calib/step_q_gap": 0.13110873815261043,
"calib/step_q_w": 0.41886546184738954,
"calib/step_q_w_n": 498.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2643.0,
"completions/max_terminated_length": 2643.0,
"completions/mean_length": 366.84375,
"completions/mean_terminated_length": 368.2823791503906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 100.0,
"epoch": 0.1088,
"grad_norm": 0.05497967079281807,
"kl": 0.1703948974609375,
"learning_rate": 2.7222222222222224e-06,
"loss": -0.0168,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.04940368980169296,
"mask/share_reasoning": 0.826754093170166,
"mask/share_step_conf": 0.11993592977523804,
"num_tokens": 23286154.0,
"reward": 1.3887107372283936,
"reward_std": 0.19226235151290894,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.7948195338249207,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8393477201461792,
"step": 102
},
{
"adv/mean_abs_final_conf": 0.7382954359054565,
"adv/mean_abs_reasoning": 0.49512580037117004,
"adv/mean_abs_step_conf": 0.7493655681610107,
"adv/ratio_final_to_reasoning": 1.4911269728864762,
"adv/ratio_step_to_reasoning": 1.5134851942662861,
"adv/std_final_conf": 0.914150595664978,
"adv/std_reasoning": 0.7393211126327515,
"adv/std_step_conf": 0.9361206293106079,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7724887556221889,
"calib/avg_num_step_conf": 3.5859375,
"calib/ece": 0.13929133858267717,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.2795275590551181,
"calib/gap": 0.2634657671164419,
"calib/mean_conf": 0.6550393700787401,
"calib/mu_c": 0.7753623188405797,
"calib/mu_w": 0.5118965517241378,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.12551181102362202,
"calib/std_conf": 0.28208412777558095,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5474346895074946,
"calib/step_q_c_n": 467.0,
"calib/step_q_gap": 0.11523069837667421,
"calib/step_q_w": 0.4322039911308204,
"calib/step_q_w_n": 451.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2320.0,
"completions/max_terminated_length": 2320.0,
"completions/mean_length": 431.74609375,
"completions/mean_terminated_length": 433.4392395019531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.10986666666666667,
"grad_norm": 0.04851434379816055,
"kl": 0.2113494873046875,
"learning_rate": 2.6944444444444444e-06,
"loss": -0.0098,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04223305359482765,
"mask/share_reasoning": 0.8557662963867188,
"mask/share_step_conf": 0.09809436649084091,
"num_tokens": 23501233.0,
"reward": 1.3910441398620605,
"reward_std": 0.189153790473938,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.780501127243042,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8480590581893921,
"step": 103
},
{
"adv/mean_abs_final_conf": 0.7384414076805115,
"adv/mean_abs_reasoning": 0.4301934540271759,
"adv/mean_abs_step_conf": 0.7679357528686523,
"adv/ratio_final_to_reasoning": 1.716533342773419,
"adv/ratio_step_to_reasoning": 1.785093998246056,
"adv/std_final_conf": 0.9183678030967712,
"adv/std_reasoning": 0.7013409733772278,
"adv/std_step_conf": 0.9360405206680298,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7520451889365017,
"calib/avg_num_step_conf": 3.78515625,
"calib/ece": 0.17993280632411068,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.233201581027668,
"calib/gap": 0.2726345279833789,
"calib/mean_conf": 0.5656245059288538,
"calib/mu_c": 0.728343137254902,
"calib/mu_w": 0.45570860927152307,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17119762845849804,
"calib/std_conf": 0.30125627166524066,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5265366972704715,
"calib/step_q_c_n": 403.0,
"calib/step_q_gap": 0.10257733331287433,
"calib/step_q_w": 0.42395936395759715,
"calib/step_q_w_n": 566.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2147.0,
"completions/max_terminated_length": 2147.0,
"completions/mean_length": 404.36328125,
"completions/mean_terminated_length": 404.36328125,
"completions/min_length": 139.0,
"completions/min_terminated_length": 139.0,
"epoch": 0.11093333333333333,
"grad_norm": 0.04931806027889252,
"kl": 0.2790069580078125,
"learning_rate": 2.666666666666667e-06,
"loss": -0.0546,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.042280539870262146,
"mask/share_reasoning": 0.8500114679336548,
"mask/share_step_conf": 0.10770799219608307,
"num_tokens": 23711430.0,
"reward": 1.3486987352371216,
"reward_std": 0.19806914031505585,
"rewards/accuracy_reward_step": 0.3984375,
"rewards/final_brier_reward_step": 0.76436847448349,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8278425931930542,
"step": 104
},
{
"adv/mean_abs_final_conf": 0.7237093448638916,
"adv/mean_abs_reasoning": 0.6229598522186279,
"adv/mean_abs_step_conf": 0.768699049949646,
"adv/ratio_final_to_reasoning": 1.1617271037394326,
"adv/ratio_step_to_reasoning": 1.2339463726466129,
"adv/std_final_conf": 0.9051206707954407,
"adv/std_reasoning": 0.8266770243644714,
"adv/std_step_conf": 0.9361938238143921,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7808970317835566,
"calib/avg_num_step_conf": 3.6796875,
"calib/ece": 0.22570281124497996,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.36947791164658633,
"calib/gap": 0.3384968479117414,
"calib/mean_conf": 0.6267469879518073,
"calib/mu_c": 0.8184259259259259,
"calib/mu_w": 0.4799290780141845,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.20935742971887553,
"calib/std_conf": 0.3418157578142417,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5456369426751593,
"calib/step_q_c_n": 314.0,
"calib/step_q_gap": 0.16186624203821665,
"calib/step_q_w": 0.38377070063694263,
"calib/step_q_w_n": 628.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2602.0,
"completions/max_terminated_length": 2602.0,
"completions/mean_length": 443.2578125,
"completions/mean_terminated_length": 443.2578125,
"completions/min_length": 114.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.112,
"grad_norm": 0.036655791103839874,
"kl": 0.152374267578125,
"learning_rate": 2.6388888888888893e-06,
"loss": 0.0456,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.043667759746313095,
"mask/share_reasoning": 0.8556262254714966,
"mask/share_step_conf": 0.1007060557603836,
"num_tokens": 23930664.0,
"reward": 1.3181248903274536,
"reward_std": 0.29982638359069824,
"rewards/accuracy_reward_step": 0.421875,
"rewards/final_brier_reward_step": 0.7417078018188477,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8082084059715271,
"step": 105
},
{
"adv/mean_abs_final_conf": 0.6950488090515137,
"adv/mean_abs_reasoning": 0.45608022809028625,
"adv/mean_abs_step_conf": 0.7821485996246338,
"adv/ratio_final_to_reasoning": 1.523961720423278,
"adv/ratio_step_to_reasoning": 1.7149364332228816,
"adv/std_final_conf": 0.8908172249794006,
"adv/std_reasoning": 0.7205820679664612,
"adv/std_step_conf": 0.9362524747848511,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7576413732504864,
"calib/avg_num_step_conf": 3.62109375,
"calib/ece": 0.26952755905511816,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.43700787401574803,
"calib/gap": 0.2638636791564676,
"calib/mean_conf": 0.7144094488188976,
"calib/mu_c": 0.8608849557522124,
"calib/mu_w": 0.5970212765957448,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.26952755905511816,
"calib/std_conf": 0.2975057824973848,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5658398058252427,
"calib/step_q_c_n": 412.0,
"calib/step_q_gap": 0.10588252427184469,
"calib/step_q_w": 0.45995728155339805,
"calib/step_q_w_n": 515.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 938.0,
"completions/max_terminated_length": 938.0,
"completions/mean_length": 374.99609375,
"completions/mean_terminated_length": 376.4666748046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 122.0,
"epoch": 0.11306666666666666,
"grad_norm": 0.06290726363658905,
"kl": 0.1614990234375,
"learning_rate": 2.6111111111111113e-06,
"loss": -0.0864,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.042815275490283966,
"mask/share_reasoning": 0.8521543741226196,
"mask/share_step_conf": 0.10112406313419342,
"num_tokens": 24131247.0,
"reward": 1.3216792345046997,
"reward_std": 0.24554114043712616,
"rewards/accuracy_reward_step": 0.44140625,
"rewards/final_brier_reward_step": 0.7165695428848267,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8200350403785706,
"step": 106
},
{
"adv/mean_abs_final_conf": 0.6705832481384277,
"adv/mean_abs_reasoning": 0.55158531665802,
"adv/mean_abs_step_conf": 0.746010422706604,
"adv/ratio_final_to_reasoning": 1.2157380334222816,
"adv/ratio_step_to_reasoning": 1.352484194515146,
"adv/std_final_conf": 0.8601038455963135,
"adv/std_reasoning": 0.7928495407104492,
"adv/std_step_conf": 0.9361860752105713,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6844349680170576,
"calib/avg_num_step_conf": 3.8671875,
"calib/ece": 0.2385375494071146,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.45849802371541504,
"calib/gap": 0.189776119402985,
"calib/mean_conf": 0.7405138339920949,
"calib/mu_c": 0.829776119402985,
"calib/mu_w": 0.64,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.22470355731225294,
"calib/std_conf": 0.2901071984069233,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5264349442379181,
"calib/step_q_c_n": 538.0,
"calib/step_q_gap": 0.0806870506680733,
"calib/step_q_w": 0.44574789356984484,
"calib/step_q_w_n": 451.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2197.0,
"completions/max_terminated_length": 2197.0,
"completions/mean_length": 412.48046875,
"completions/mean_terminated_length": 412.48046875,
"completions/min_length": 137.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.11413333333333334,
"grad_norm": 0.04678349569439888,
"kl": 0.163360595703125,
"learning_rate": 2.5833333333333337e-06,
"loss": 0.0671,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.04249659553170204,
"mask/share_reasoning": 0.8509604930877686,
"mask/share_step_conf": 0.10654290020465851,
"num_tokens": 24341458.0,
"reward": 1.3123687505722046,
"reward_std": 0.2601582407951355,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.7044988870620728,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8093380928039551,
"step": 107
},
{
"adv/mean_abs_final_conf": 0.6462497711181641,
"adv/mean_abs_reasoning": 0.5497677326202393,
"adv/mean_abs_step_conf": 0.7494109869003296,
"adv/ratio_final_to_reasoning": 1.1754960008985673,
"adv/ratio_step_to_reasoning": 1.3631410911087374,
"adv/std_final_conf": 0.8757672309875488,
"adv/std_reasoning": 0.7927690744400024,
"adv/std_step_conf": 0.9362542033195496,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6808732486151841,
"calib/avg_num_step_conf": 4.328125,
"calib/ece": 0.17977165354330718,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5787401574803149,
"calib/gap": 0.21701987618116647,
"calib/mean_conf": 0.7645748031496062,
"calib/mu_c": 0.8491612903225807,
"calib/mu_w": 0.6321414141414142,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1670551181102363,
"calib/std_conf": 0.30523495583362753,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.513030959752322,
"calib/step_q_c_n": 646.0,
"calib/step_q_gap": 0.08474524546660778,
"calib/step_q_w": 0.42828571428571427,
"calib/step_q_w_n": 462.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1428.0,
"completions/max_terminated_length": 1428.0,
"completions/mean_length": 417.625,
"completions/mean_terminated_length": 417.625,
"completions/min_length": 79.0,
"completions/min_terminated_length": 79.0,
"epoch": 0.1152,
"grad_norm": 0.03446570038795471,
"kl": 0.147552490234375,
"learning_rate": 2.5555555555555557e-06,
"loss": -0.027,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04210588335990906,
"mask/share_reasoning": 0.8456481695175171,
"mask/share_step_conf": 0.11224594712257385,
"num_tokens": 24551602.0,
"reward": 1.358659029006958,
"reward_std": 0.24917784333229065,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7425518035888672,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8272268772125244,
"step": 108
},
{
"adv/mean_abs_final_conf": 0.6396289467811584,
"adv/mean_abs_reasoning": 0.4717791676521301,
"adv/mean_abs_step_conf": 0.7523552179336548,
"adv/ratio_final_to_reasoning": 1.355780396078857,
"adv/ratio_step_to_reasoning": 1.5947190328005527,
"adv/std_final_conf": 0.8753486275672913,
"adv/std_reasoning": 0.7574488520622253,
"adv/std_step_conf": 0.9360347986221313,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.8373565492679066,
"calib/avg_num_step_conf": 4.4296875,
"calib/ece": 0.18340080971659917,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.4008097165991903,
"calib/gap": 0.4124436090225562,
"calib/mean_conf": 0.6397570850202429,
"calib/mu_c": 0.8618421052631577,
"calib/mu_w": 0.4493984962406015,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.18080971659919026,
"calib/std_conf": 0.34836800208824037,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.530151966873706,
"calib/step_q_c_n": 483.0,
"calib/step_q_gap": 0.18351003138983502,
"calib/step_q_w": 0.346641935483871,
"calib/step_q_w_n": 651.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2576.0,
"completions/max_terminated_length": 2576.0,
"completions/mean_length": 461.74609375,
"completions/mean_terminated_length": 461.74609375,
"completions/min_length": 121.0,
"completions/min_terminated_length": 121.0,
"epoch": 0.11626666666666667,
"grad_norm": 0.04146302118897438,
"kl": 0.151702880859375,
"learning_rate": 2.5277777777777778e-06,
"loss": -0.0039,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03965022414922714,
"mask/share_reasoning": 0.8478109836578369,
"mask/share_step_conf": 0.11253875494003296,
"num_tokens": 24774409.0,
"reward": 1.3690853118896484,
"reward_std": 0.26065385341644287,
"rewards/accuracy_reward_step": 0.4453125,
"rewards/final_brier_reward_step": 0.7751156091690063,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8405119180679321,
"step": 109
},
{
"adv/mean_abs_final_conf": 0.7240575551986694,
"adv/mean_abs_reasoning": 0.5572279691696167,
"adv/mean_abs_step_conf": 0.7480405569076538,
"adv/ratio_final_to_reasoning": 1.2993919818448145,
"adv/ratio_step_to_reasoning": 1.3424318201801442,
"adv/std_final_conf": 0.9065413475036621,
"adv/std_reasoning": 0.7928216457366943,
"adv/std_step_conf": 0.9361799955368042,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7071007811499552,
"calib/avg_num_step_conf": 3.66796875,
"calib/ece": 0.23066533864541833,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.4262948207171315,
"calib/gap": 0.23882622614931492,
"calib/mean_conf": 0.6706095617529879,
"calib/mu_c": 0.8009649122807018,
"calib/mu_w": 0.5621386861313868,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.22354581673306773,
"calib/std_conf": 0.318844875651609,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5426075949367088,
"calib/step_q_c_n": 395.0,
"calib/step_q_gap": 0.13413296258376772,
"calib/step_q_w": 0.4084746323529411,
"calib/step_q_w_n": 544.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1917.0,
"completions/max_terminated_length": 1917.0,
"completions/mean_length": 385.34375,
"completions/mean_terminated_length": 388.3779602050781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 56.0,
"epoch": 0.11733333333333333,
"grad_norm": 0.06782060861587524,
"kl": 0.172576904296875,
"learning_rate": 2.5e-06,
"loss": -0.0693,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.04428033158183098,
"mask/share_reasoning": 0.8394521474838257,
"mask/share_step_conf": 0.10845498740673065,
"num_tokens": 24977977.0,
"reward": 1.3049430847167969,
"reward_std": 0.24617999792099,
"rewards/accuracy_reward_step": 0.4453125,
"rewards/final_brier_reward_step": 0.7079055309295654,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8084121942520142,
"step": 110
},
{
"adv/mean_abs_final_conf": 0.6536703109741211,
"adv/mean_abs_reasoning": 0.5757176876068115,
"adv/mean_abs_step_conf": 0.776113748550415,
"adv/ratio_final_to_reasoning": 1.1354007789674643,
"adv/ratio_step_to_reasoning": 1.3480804311860308,
"adv/std_final_conf": 0.8580630421638489,
"adv/std_reasoning": 0.7928107976913452,
"adv/std_step_conf": 0.9360227584838867,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7613636363636365,
"calib/avg_num_step_conf": 3.640625,
"calib/ece": 0.23164705882352937,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.5254901960784314,
"calib/gap": 0.3368699186991869,
"calib/mean_conf": 0.7008235294117646,
"calib/mu_c": 0.8752032520325203,
"calib/mu_w": 0.5383333333333333,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2250588235294117,
"calib/std_conf": 0.3455335576269934,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.572342857142857,
"calib/step_q_c_n": 385.0,
"calib/step_q_gap": 0.19892055366936529,
"calib/step_q_w": 0.37342230347349176,
"calib/step_q_w_n": 547.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2741.0,
"completions/max_terminated_length": 2741.0,
"completions/mean_length": 406.5625,
"completions/mean_terminated_length": 406.5625,
"completions/min_length": 90.0,
"completions/min_terminated_length": 90.0,
"epoch": 0.1184,
"grad_norm": 0.04310350865125656,
"kl": 0.1740875244140625,
"learning_rate": 2.4722222222222226e-06,
"loss": -0.0381,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.04492507129907608,
"mask/share_reasoning": 0.8509223461151123,
"mask/share_step_conf": 0.10415257513523102,
"num_tokens": 25189465.0,
"reward": 1.3606820106506348,
"reward_std": 0.22834959626197815,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.7484785318374634,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8387864828109741,
"step": 111
},
{
"adv/mean_abs_final_conf": 0.6845964193344116,
"adv/mean_abs_reasoning": 0.5019567012786865,
"adv/mean_abs_step_conf": 0.7608423233032227,
"adv/ratio_final_to_reasoning": 1.3638555229773164,
"adv/ratio_step_to_reasoning": 1.5157528953494392,
"adv/std_final_conf": 0.8892654180526733,
"adv/std_reasoning": 0.757592499256134,
"adv/std_step_conf": 0.935874879360199,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7720385674931128,
"calib/avg_num_step_conf": 3.66015625,
"calib/ece": 0.12848484848484854,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.3359683794466403,
"calib/gap": 0.37050045913682284,
"calib/mean_conf": 0.533965744400527,
"calib/mu_c": 0.7111616161616162,
"calib/mu_w": 0.3406611570247934,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.07035573122529648,
"calib/std_conf": 0.3735869229363475,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4601897233201581,
"calib/step_q_c_n": 506.0,
"calib/step_q_gap": 0.1314147813247985,
"calib/step_q_w": 0.3287749419953596,
"calib/step_q_w_n": 431.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2475.0,
"completions/max_terminated_length": 2475.0,
"completions/mean_length": 439.5234375,
"completions/mean_terminated_length": 439.5234375,
"completions/min_length": 136.0,
"completions/min_terminated_length": 136.0,
"epoch": 0.11946666666666667,
"grad_norm": 0.041784606873989105,
"kl": 0.1550445556640625,
"learning_rate": 2.4444444444444447e-06,
"loss": -0.117,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03895264118909836,
"mask/share_reasoning": 0.8648859262466431,
"mask/share_step_conf": 0.09616147726774216,
"num_tokens": 25409903.0,
"reward": 1.3797184228897095,
"reward_std": 0.21610228717327118,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.7841790914535522,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8376288414001465,
"step": 112
},
{
"adv/mean_abs_final_conf": 0.7069253921508789,
"adv/mean_abs_reasoning": 0.6055176258087158,
"adv/mean_abs_step_conf": 0.7630271315574646,
"adv/ratio_final_to_reasoning": 1.1674728563131835,
"adv/ratio_step_to_reasoning": 1.2601237338688245,
"adv/std_final_conf": 0.8900803923606873,
"adv/std_reasoning": 0.8265026211738586,
"adv/std_step_conf": 0.9358813166618347,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7211304510574585,
"calib/avg_num_step_conf": 4.26953125,
"calib/ece": 0.21665354330708658,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.4645669291338583,
"calib/gap": 0.2851325722128642,
"calib/mean_conf": 0.6341338582677165,
"calib/mu_c": 0.7654744525547446,
"calib/mu_w": 0.48034188034188036,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1557086614173228,
"calib/std_conf": 0.37022635898206085,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.45904504504504506,
"calib/step_q_c_n": 555.0,
"calib/step_q_gap": 0.13076623463612314,
"calib/step_q_w": 0.32827881040892193,
"calib/step_q_w_n": 538.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1674.0,
"completions/max_terminated_length": 1674.0,
"completions/mean_length": 374.02734375,
"completions/mean_terminated_length": 375.494140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 128.0,
"epoch": 0.12053333333333334,
"grad_norm": 0.0556560680270195,
"kl": 0.211578369140625,
"learning_rate": 2.4166666666666667e-06,
"loss": -0.008,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.044888682663440704,
"mask/share_reasoning": 0.8306836485862732,
"mask/share_step_conf": 0.1205214336514473,
"num_tokens": 25610854.0,
"reward": 1.3818920850753784,
"reward_std": 0.2296406775712967,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.7413472533226013,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8584840297698975,
"step": 113
},
{
"adv/mean_abs_final_conf": 0.7044678926467896,
"adv/mean_abs_reasoning": 0.5276539325714111,
"adv/mean_abs_step_conf": 0.7533060312271118,
"adv/ratio_final_to_reasoning": 1.3350945556564178,
"adv/ratio_step_to_reasoning": 1.4276516950343425,
"adv/std_final_conf": 0.9050940275192261,
"adv/std_reasoning": 0.7926760315895081,
"adv/std_step_conf": 0.9359117746353149,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7912844036697247,
"calib/avg_num_step_conf": 3.875,
"calib/ece": 0.13740784313725485,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.4823529411764706,
"calib/gap": 0.4216498680407189,
"calib/mean_conf": 0.6370941176470588,
"calib/mu_c": 0.8173287671232876,
"calib/mu_w": 0.3956788990825687,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10097647058823525,
"calib/std_conf": 0.3869714877330616,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4834704830053667,
"calib/step_q_c_n": 559.0,
"calib/step_q_gap": 0.14740350840952376,
"calib/step_q_w": 0.33606697459584295,
"calib/step_q_w_n": 433.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1745.0,
"completions/max_terminated_length": 1745.0,
"completions/mean_length": 386.40234375,
"completions/mean_terminated_length": 386.40234375,
"completions/min_length": 99.0,
"completions/min_terminated_length": 99.0,
"epoch": 0.1216,
"grad_norm": 0.04214748367667198,
"kl": 0.2071685791015625,
"learning_rate": 2.388888888888889e-06,
"loss": 0.025,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.04536930471658707,
"mask/share_reasoning": 0.8384172916412354,
"mask/share_step_conf": 0.1162133663892746,
"num_tokens": 25814797.0,
"reward": 1.4204580783843994,
"reward_std": 0.22163823246955872,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.8045815229415894,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8615266680717468,
"step": 114
},
{
"adv/mean_abs_final_conf": 0.6950163841247559,
"adv/mean_abs_reasoning": 0.5868062973022461,
"adv/mean_abs_step_conf": 0.7511539459228516,
"adv/ratio_final_to_reasoning": 1.1844051219627147,
"adv/ratio_step_to_reasoning": 1.2800713785386577,
"adv/std_final_conf": 0.8760892748832703,
"adv/std_reasoning": 0.8098748326301575,
"adv/std_step_conf": 0.9364647269248962,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6829407294832828,
"calib/avg_num_step_conf": 3.95703125,
"calib/ece": 0.2199565217391305,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.33992094861660077,
"calib/gap": 0.24440400202634244,
"calib/mean_conf": 0.5407549407114625,
"calib/mu_c": 0.6769642857142857,
"calib/mu_w": 0.43256028368794325,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.15901185770750992,
"calib/std_conf": 0.3805655080979492,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4071922246220302,
"calib/step_q_c_n": 463.0,
"calib/step_q_gap": 0.07122858825839379,
"calib/step_q_w": 0.3359636363636364,
"calib/step_q_w_n": 550.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2701.0,
"completions/max_terminated_length": 2701.0,
"completions/mean_length": 417.3046875,
"completions/mean_terminated_length": 417.3046875,
"completions/min_length": 91.0,
"completions/min_terminated_length": 91.0,
"epoch": 0.12266666666666666,
"grad_norm": 0.041259538382291794,
"kl": 0.1705780029296875,
"learning_rate": 2.361111111111111e-06,
"loss": -0.0094,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.04276685416698456,
"mask/share_reasoning": 0.8451552987098694,
"mask/share_step_conf": 0.11207783222198486,
"num_tokens": 26026891.0,
"reward": 1.3043255805969238,
"reward_std": 0.2694549262523651,
"rewards/accuracy_reward_step": 0.4375,
"rewards/final_brier_reward_step": 0.7081127166748047,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8080816864967346,
"step": 115
},
{
"adv/mean_abs_final_conf": 0.7447359561920166,
"adv/mean_abs_reasoning": 0.5281481742858887,
"adv/mean_abs_step_conf": 0.7545117139816284,
"adv/ratio_final_to_reasoning": 1.410089047830142,
"adv/ratio_step_to_reasoning": 1.4285985462352622,
"adv/std_final_conf": 0.9046410918235779,
"adv/std_reasoning": 0.7576063871383667,
"adv/std_step_conf": 0.9354843497276306,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.681582238899312,
"calib/avg_num_step_conf": 4.23828125,
"calib/ece": 0.22569169960474306,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.3438735177865613,
"calib/gap": 0.26624640400250144,
"calib/mean_conf": 0.5020553359683795,
"calib/mu_c": 0.638861788617886,
"calib/mu_w": 0.37261538461538457,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.12079051383399207,
"calib/std_conf": 0.39732470580007523,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4007942477876107,
"calib/step_q_c_n": 452.0,
"calib/step_q_gap": 0.12556107243216047,
"calib/step_q_w": 0.2752331753554502,
"calib/step_q_w_n": 633.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2622.0,
"completions/max_terminated_length": 2622.0,
"completions/mean_length": 443.2265625,
"completions/mean_terminated_length": 444.9647216796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 55.0,
"epoch": 0.12373333333333333,
"grad_norm": 0.06766902655363083,
"kl": 0.162445068359375,
"learning_rate": 2.3333333333333336e-06,
"loss": -0.0187,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.04233179986476898,
"mask/share_reasoning": 0.8447202444076538,
"mask/share_step_conf": 0.1090417355298996,
"num_tokens": 26244877.0,
"reward": 1.3489094972610474,
"reward_std": 0.23857152462005615,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.7165961265563965,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8437364101409912,
"step": 116
},
{
"adv/mean_abs_final_conf": 0.6993064880371094,
"adv/mean_abs_reasoning": 0.5325876474380493,
"adv/mean_abs_step_conf": 0.7361884117126465,
"adv/ratio_final_to_reasoning": 1.3130355001679848,
"adv/ratio_step_to_reasoning": 1.382285930313996,
"adv/std_final_conf": 0.8909405469894409,
"adv/std_reasoning": 0.7577109336853027,
"adv/std_step_conf": 0.9362501502037048,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6957074175824175,
"calib/avg_num_step_conf": 4.17578125,
"calib/ece": 0.20776892430278887,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.32669322709163345,
"calib/gap": 0.27219711538461544,
"calib/mean_conf": 0.5018725099601594,
"calib/mu_c": 0.6753846153846154,
"calib/mu_w": 0.4031874999999999,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.17354581673306774,
"calib/std_conf": 0.3831293869864129,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.39309,
"calib/step_q_c_n": 380.0,
"calib/step_q_gap": 0.07801031930333818,
"calib/step_q_w": 0.3150796806966618,
"calib/step_q_w_n": 689.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2724.0,
"completions/max_terminated_length": 2724.0,
"completions/mean_length": 427.484375,
"completions/mean_terminated_length": 429.1607971191406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.1248,
"grad_norm": 0.06574005633592606,
"kl": 0.1655731201171875,
"learning_rate": 2.305555555555556e-06,
"loss": -0.0788,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03895164281129837,
"mask/share_reasoning": 0.8477646708488464,
"mask/share_step_conf": 0.10937744379043579,
"num_tokens": 26460913.0,
"reward": 1.3173385858535767,
"reward_std": 0.2695554792881012,
"rewards/accuracy_reward_step": 0.35546875,
"rewards/final_brier_reward_step": 0.714278519153595,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8266055583953857,
"step": 117
},
{
"adv/mean_abs_final_conf": 0.7023355960845947,
"adv/mean_abs_reasoning": 0.5506028532981873,
"adv/mean_abs_step_conf": 0.7614545226097107,
"adv/ratio_final_to_reasoning": 1.2755756565326666,
"adv/ratio_step_to_reasoning": 1.382946924536429,
"adv/std_final_conf": 0.9201270937919617,
"adv/std_reasoning": 0.8097826242446899,
"adv/std_step_conf": 0.9361098408699036,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.725543652575534,
"calib/avg_num_step_conf": 4.921875,
"calib/ece": 0.18125199999999997,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.352,
"calib/gap": 0.3181899416255052,
"calib/mean_conf": 0.520908,
"calib/mu_c": 0.6723664122137405,
"calib/mu_w": 0.3541764705882353,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.08908,
"calib/std_conf": 0.38569740929386603,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.3985378787878788,
"calib/step_q_c_n": 528.0,
"calib/step_q_gap": 0.1381376055638351,
"calib/step_q_w": 0.2604002732240437,
"calib/step_q_w_n": 732.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2721.0,
"completions/max_terminated_length": 2721.0,
"completions/mean_length": 419.71875,
"completions/mean_terminated_length": 421.3647155761719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.12586666666666665,
"grad_norm": 0.04007981717586517,
"kl": 0.1764984130859375,
"learning_rate": 2.277777777777778e-06,
"loss": 0.0011,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.041666384786367416,
"mask/share_reasoning": 0.8308029174804688,
"mask/share_step_conf": 0.12362450361251831,
"num_tokens": 26672369.0,
"reward": 1.3235828876495361,
"reward_std": 0.2537011206150055,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.742707371711731,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.803400993347168,
"step": 118
},
{
"adv/mean_abs_final_conf": 0.7023715972900391,
"adv/mean_abs_reasoning": 0.6352405548095703,
"adv/mean_abs_step_conf": 0.7528685331344604,
"adv/ratio_final_to_reasoning": 1.1056781434563683,
"adv/ratio_step_to_reasoning": 1.1851707631609762,
"adv/std_final_conf": 0.8904647827148438,
"adv/std_reasoning": 0.8429591655731201,
"adv/std_step_conf": 0.9359723329544067,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.762542330364982,
"calib/avg_num_step_conf": 4.2578125,
"calib/ece": 0.15938735177865615,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.28063241106719367,
"calib/gap": 0.3792264517747398,
"calib/mean_conf": 0.419901185770751,
"calib/mu_c": 0.6207563025210084,
"calib/mu_w": 0.24152985074626868,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.05446640316205535,
"calib/std_conf": 0.3928200592273774,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.372380658436214,
"calib/step_q_c_n": 486.0,
"calib/step_q_gap": 0.13757193327065104,
"calib/step_q_w": 0.23480872516556295,
"calib/step_q_w_n": 604.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2203.0,
"completions/max_terminated_length": 2203.0,
"completions/mean_length": 449.1640625,
"completions/mean_terminated_length": 450.9255065917969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 67.0,
"epoch": 0.12693333333333334,
"grad_norm": 0.050807878375053406,
"kl": 0.172210693359375,
"learning_rate": 2.25e-06,
"loss": -0.0699,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.04051949828863144,
"mask/share_reasoning": 0.8514037132263184,
"mask/share_step_conf": 0.10417056083679199,
"num_tokens": 26892419.0,
"reward": 1.3865572214126587,
"reward_std": 0.23146747052669525,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/final_brier_reward_step": 0.773796796798706,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8543463945388794,
"step": 119
},
{
"adv/mean_abs_final_conf": 0.7236967086791992,
"adv/mean_abs_reasoning": 0.5814656019210815,
"adv/mean_abs_step_conf": 0.7500573396682739,
"adv/ratio_final_to_reasoning": 1.2446079463483408,
"adv/ratio_step_to_reasoning": 1.289942753604321,
"adv/std_final_conf": 0.9180964231491089,
"adv/std_reasoning": 0.8266046643257141,
"adv/std_step_conf": 0.936331570148468,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7371337827326938,
"calib/avg_num_step_conf": 4.2109375,
"calib/ece": 0.2342570281124498,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.24497991967871485,
"calib/gap": 0.31392792325641694,
"calib/mean_conf": 0.38871485943775097,
"calib/mu_c": 0.5349624060150376,
"calib/mu_w": 0.22103448275862067,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.04441767068273092,
"calib/std_conf": 0.3788589333586088,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.3636018691588785,
"calib/step_q_c_n": 535.0,
"calib/step_q_gap": 0.1408436739470921,
"calib/step_q_w": 0.2227581952117864,
"calib/step_q_w_n": 543.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2790.0,
"completions/max_terminated_length": 2790.0,
"completions/mean_length": 413.84765625,
"completions/mean_terminated_length": 417.1062927246094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.128,
"grad_norm": 0.09071393311023712,
"kl": 0.1795806884765625,
"learning_rate": 2.222222222222222e-06,
"loss": -0.0262,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.040145181119441986,
"mask/share_reasoning": 0.8437585234642029,
"mask/share_step_conf": 0.10828377306461334,
"num_tokens": 27105052.0,
"reward": 1.322953224182129,
"reward_std": 0.26806381344795227,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.7185671925544739,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8148413896560669,
"step": 120
},
{
"adv/mean_abs_final_conf": 0.7919062376022339,
"adv/mean_abs_reasoning": 0.5846736431121826,
"adv/mean_abs_step_conf": 0.7712915539741516,
"adv/ratio_final_to_reasoning": 1.3544414853164317,
"adv/ratio_step_to_reasoning": 1.3191830400779023,
"adv/std_final_conf": 0.934366762638092,
"adv/std_reasoning": 0.7929185628890991,
"adv/std_step_conf": 0.9359788298606873,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6865609880315762,
"calib/avg_num_step_conf": 4.87890625,
"calib/ece": 0.25900398406374503,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.26294820717131473,
"calib/gap": 0.22867583396995161,
"calib/mean_conf": 0.4162948207171315,
"calib/mu_c": 0.5365546218487395,
"calib/mu_w": 0.30787878787878786,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.100597609561753,
"calib/std_conf": 0.3855396909784887,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.34495750487329435,
"calib/step_q_c_n": 513.0,
"calib/step_q_gap": 0.09719921682981605,
"calib/step_q_w": 0.2477582880434783,
"calib/step_q_w_n": 736.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2834.0,
"completions/max_terminated_length": 2834.0,
"completions/mean_length": 469.578125,
"completions/mean_terminated_length": 473.27557373046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.12906666666666666,
"grad_norm": 0.044260066002607346,
"kl": 0.1622772216796875,
"learning_rate": 2.1944444444444445e-06,
"loss": -0.0574,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.037188708782196045,
"mask/share_reasoning": 0.8464273810386658,
"mask/share_step_conf": 0.10857141017913818,
"num_tokens": 27330320.0,
"reward": 1.314887523651123,
"reward_std": 0.2514931559562683,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/final_brier_reward_step": 0.6987988352775574,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.820956826210022,
"step": 121
},
{
"adv/mean_abs_final_conf": 0.6651676297187805,
"adv/mean_abs_reasoning": 0.4964384436607361,
"adv/mean_abs_step_conf": 0.7753804922103882,
"adv/ratio_final_to_reasoning": 1.3398793711740689,
"adv/ratio_step_to_reasoning": 1.561886477793166,
"adv/std_final_conf": 0.868554413318634,
"adv/std_reasoning": 0.7753000855445862,
"adv/std_step_conf": 0.9360112547874451,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.810948191593353,
"calib/avg_num_step_conf": 4.41015625,
"calib/ece": 0.15142913385826773,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5,
"calib/gap": 0.4917824046920821,
"calib/mean_conf": 0.5998307086614173,
"calib/mu_c": 0.7915096774193549,
"calib/mu_w": 0.29972727272727273,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.07051181102362206,
"calib/std_conf": 0.42493590864806063,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.41932664756446997,
"calib/step_q_c_n": 698.0,
"calib/step_q_gap": 0.17259439698442355,
"calib/step_q_w": 0.2467322505800464,
"calib/step_q_w_n": 431.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1969.0,
"completions/max_terminated_length": 1969.0,
"completions/mean_length": 394.44921875,
"completions/mean_terminated_length": 395.99609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.13013333333333332,
"grad_norm": 0.04509063810110092,
"kl": 0.1987762451171875,
"learning_rate": 2.166666666666667e-06,
"loss": -0.0169,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04127652570605278,
"mask/share_reasoning": 0.8373570442199707,
"mask/share_step_conf": 0.11746013164520264,
"num_tokens": 27538643.0,
"reward": 1.4087672233581543,
"reward_std": 0.2185821235179901,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.8090417385101318,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8444807529449463,
"step": 122
},
{
"adv/mean_abs_final_conf": 0.7057344913482666,
"adv/mean_abs_reasoning": 0.5242763757705688,
"adv/mean_abs_step_conf": 0.7717443108558655,
"adv/ratio_final_to_reasoning": 1.346111562457864,
"adv/ratio_step_to_reasoning": 1.4720180929792501,
"adv/std_final_conf": 0.8740495443344116,
"adv/std_reasoning": 0.7575638890266418,
"adv/std_step_conf": 0.9360249042510986,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.682103952692188,
"calib/avg_num_step_conf": 4.42578125,
"calib/ece": 0.23219763779527566,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.2992125984251969,
"calib/gap": 0.2641871023965142,
"calib/mean_conf": 0.42252677165354335,
"calib/mu_c": 0.5629411764705883,
"calib/mu_w": 0.29875407407407406,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09311023622047249,
"calib/std_conf": 0.39652975758870523,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3490301075268817,
"calib/step_q_c_n": 465.0,
"calib/step_q_gap": 0.12006304165861822,
"calib/step_q_w": 0.22896706586826346,
"calib/step_q_w_n": 668.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2044.0,
"completions/max_terminated_length": 2044.0,
"completions/mean_length": 460.00390625,
"completions/mean_terminated_length": 461.807861328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 110.0,
"epoch": 0.1312,
"grad_norm": 0.04096907004714012,
"kl": 0.1746368408203125,
"learning_rate": 2.138888888888889e-06,
"loss": 0.012,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03718689829111099,
"mask/share_reasoning": 0.8577108979225159,
"mask/share_step_conf": 0.10119592398405075,
"num_tokens": 27761692.0,
"reward": 1.3273255825042725,
"reward_std": 0.21498894691467285,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/final_brier_reward_step": 0.7175615429878235,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8228417634963989,
"step": 123
},
{
"adv/mean_abs_final_conf": 0.698214054107666,
"adv/mean_abs_reasoning": 0.4900742769241333,
"adv/mean_abs_step_conf": 0.7525066137313843,
"adv/ratio_final_to_reasoning": 1.4247106754712493,
"adv/ratio_step_to_reasoning": 1.5354950242529812,
"adv/std_final_conf": 0.9064407348632812,
"adv/std_reasoning": 0.739323079586029,
"adv/std_step_conf": 0.9363665580749512,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7348370927318297,
"calib/avg_num_step_conf": 4.68359375,
"calib/ece": 0.21602362204724407,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.4015748031496063,
"calib/gap": 0.3618107769423559,
"calib/mean_conf": 0.5108267716535433,
"calib/mu_c": 0.6732142857142858,
"calib/mu_w": 0.31140350877192985,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08783464566929136,
"calib/std_conf": 0.42134919392385983,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3774491803278689,
"calib/step_q_c_n": 610.0,
"calib/step_q_gap": 0.11135206657574664,
"calib/step_q_w": 0.26609711375212225,
"calib/step_q_w_n": 589.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2324.0,
"completions/max_terminated_length": 2324.0,
"completions/mean_length": 432.83203125,
"completions/mean_terminated_length": 432.83203125,
"completions/min_length": 150.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.13226666666666667,
"grad_norm": 0.12796330451965332,
"kl": 0.193145751953125,
"learning_rate": 2.1111111111111114e-06,
"loss": 0.0172,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03850656747817993,
"mask/share_reasoning": 0.8427542448043823,
"mask/share_step_conf": 0.11873914301395416,
"num_tokens": 27979313.0,
"reward": 1.3393280506134033,
"reward_std": 0.24536257982254028,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.7465871572494507,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8121283054351807,
"step": 124
},
{
"adv/mean_abs_final_conf": 0.6791626214981079,
"adv/mean_abs_reasoning": 0.5428112149238586,
"adv/mean_abs_step_conf": 0.7677067518234253,
"adv/ratio_final_to_reasoning": 1.251194895804383,
"adv/ratio_step_to_reasoning": 1.4143163050363896,
"adv/std_final_conf": 0.8601999878883362,
"adv/std_reasoning": 0.7754709720611572,
"adv/std_step_conf": 0.9360963106155396,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.669949494949495,
"calib/avg_num_step_conf": 4.05859375,
"calib/ece": 0.2620238095238096,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.43253968253968256,
"calib/gap": 0.2544166666666667,
"calib/mean_conf": 0.5611507936507937,
"calib/mu_c": 0.6944166666666667,
"calib/mu_w": 0.44,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.17349206349206356,
"calib/std_conf": 0.41145196960644254,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.39113207547169815,
"calib/step_q_c_n": 477.0,
"calib/step_q_gap": 0.0631391929094206,
"calib/step_q_w": 0.32799288256227754,
"calib/step_q_w_n": 562.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2193.0,
"completions/max_terminated_length": 2193.0,
"completions/mean_length": 462.08984375,
"completions/mean_terminated_length": 462.08984375,
"completions/min_length": 97.0,
"completions/min_terminated_length": 97.0,
"epoch": 0.13333333333333333,
"grad_norm": 0.04287952929735184,
"kl": 0.1557159423828125,
"learning_rate": 2.0833333333333334e-06,
"loss": 0.0341,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.040689751505851746,
"mask/share_reasoning": 0.8555436730384827,
"mask/share_step_conf": 0.1037665456533432,
"num_tokens": 28202416.0,
"reward": 1.2918962240219116,
"reward_std": 0.27244269847869873,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.6861358880996704,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8039063215255737,
"step": 125
},
{
"adv/mean_abs_final_conf": 0.6152515411376953,
"adv/mean_abs_reasoning": 0.4486311078071594,
"adv/mean_abs_step_conf": 0.7678678035736084,
"adv/ratio_final_to_reasoning": 1.3713974141136829,
"adv/ratio_step_to_reasoning": 1.7115794919501444,
"adv/std_final_conf": 0.8269786834716797,
"adv/std_reasoning": 0.7206025123596191,
"adv/std_step_conf": 0.9352603554725647,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7698147195059187,
"calib/avg_num_step_conf": 4.8359375,
"calib/ece": 0.195924,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.528,
"calib/gap": 0.4379417138445702,
"calib/mean_conf": 0.6099959999999999,
"calib/mu_c": 0.8447327586206896,
"calib/mu_w": 0.40679104477611944,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17096,
"calib/std_conf": 0.4277949087869093,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.47163960396039606,
"calib/step_q_c_n": 505.0,
"calib/step_q_gap": 0.21580058622506182,
"calib/step_q_w": 0.25583901773533424,
"calib/step_q_w_n": 733.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2685.0,
"completions/max_terminated_length": 2685.0,
"completions/mean_length": 471.8671875,
"completions/mean_terminated_length": 473.7176818847656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 68.0,
"epoch": 0.1344,
"grad_norm": 0.0364367812871933,
"kl": 0.1450653076171875,
"learning_rate": 2.0555555555555555e-06,
"loss": 0.0102,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.0397508405148983,
"mask/share_reasoning": 0.846636176109314,
"mask/share_step_conf": 0.10970672965049744,
"num_tokens": 28428678.0,
"reward": 1.342126488685608,
"reward_std": 0.258208304643631,
"rewards/accuracy_reward_step": 0.453125,
"rewards/final_brier_reward_step": 0.74688321352005,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8261067271232605,
"step": 126
},
{
"adv/mean_abs_final_conf": 0.6154482364654541,
"adv/mean_abs_reasoning": 0.41630616784095764,
"adv/mean_abs_step_conf": 0.7662066221237183,
"adv/ratio_final_to_reasoning": 1.478354835954713,
"adv/ratio_step_to_reasoning": 1.840488278368323,
"adv/std_final_conf": 0.825966477394104,
"adv/std_reasoning": 0.7014350295066833,
"adv/std_step_conf": 0.9359336495399475,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7410403103833761,
"calib/avg_num_step_conf": 4.83203125,
"calib/ece": 0.2454435483870968,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.5645161290322581,
"calib/gap": 0.3928749917801011,
"calib/mean_conf": 0.6350403225806452,
"calib/mu_c": 0.8520720720720719,
"calib/mu_w": 0.4591970802919708,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21645161290322584,
"calib/std_conf": 0.43111726380218535,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4743393574297189,
"calib/step_q_c_n": 498.0,
"calib/step_q_gap": 0.1798232463336431,
"calib/step_q_w": 0.2945161110960758,
"calib/step_q_w_n": 739.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2630.0,
"completions/max_terminated_length": 2630.0,
"completions/mean_length": 450.75,
"completions/mean_terminated_length": 454.2992248535156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 108.0,
"epoch": 0.13546666666666668,
"grad_norm": 0.06047916039824486,
"kl": 0.1569366455078125,
"learning_rate": 2.027777777777778e-06,
"loss": -0.0308,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.040085144340991974,
"mask/share_reasoning": 0.8332340717315674,
"mask/share_step_conf": 0.11886821687221527,
"num_tokens": 28647742.0,
"reward": 1.3079798221588135,
"reward_std": 0.2906861901283264,
"rewards/accuracy_reward_step": 0.43359375,
"rewards/final_brier_reward_step": 0.6997348070144653,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8182685375213623,
"step": 127
},
{
"adv/mean_abs_final_conf": 0.7099267244338989,
"adv/mean_abs_reasoning": 0.5674612522125244,
"adv/mean_abs_step_conf": 0.738547682762146,
"adv/ratio_final_to_reasoning": 1.251057621407459,
"adv/ratio_step_to_reasoning": 1.301494471882543,
"adv/std_final_conf": 0.8756265640258789,
"adv/std_reasoning": 0.7929064035415649,
"adv/std_step_conf": 0.9359824061393738,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7090269712220932,
"calib/avg_num_step_conf": 4.0859375,
"calib/ece": 0.2501204819277109,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5742971887550201,
"calib/gap": 0.35529229578010063,
"calib/mean_conf": 0.6424899598393575,
"calib/mu_c": 0.8222764227642276,
"calib/mu_w": 0.466984126984127,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1993172690763053,
"calib/std_conf": 0.42755330922413926,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.45157582417582415,
"calib/step_q_c_n": 455.0,
"calib/step_q_gap": 0.15030002045331992,
"calib/step_q_w": 0.30127580372250423,
"calib/step_q_w_n": 591.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2367.0,
"completions/max_terminated_length": 2367.0,
"completions/mean_length": 444.80078125,
"completions/mean_terminated_length": 451.86114501953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 85.0,
"epoch": 0.13653333333333334,
"grad_norm": 0.047435879707336426,
"kl": 0.160675048828125,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.0491,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03938092291355133,
"mask/share_reasoning": 0.8430782556533813,
"mask/share_step_conf": 0.10191580653190613,
"num_tokens": 28868275.0,
"reward": 1.3003993034362793,
"reward_std": 0.2986387610435486,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.7030344009399414,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8035696744918823,
"step": 128
},
{
"adv/mean_abs_final_conf": 0.6892992258071899,
"adv/mean_abs_reasoning": 0.6112114191055298,
"adv/mean_abs_step_conf": 0.7431253790855408,
"adv/ratio_final_to_reasoning": 1.1277590769098143,
"adv/ratio_step_to_reasoning": 1.2158237818479551,
"adv/std_final_conf": 0.8707001209259033,
"adv/std_reasoning": 0.8428522348403931,
"adv/std_step_conf": 0.9362640976905823,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6386879837584063,
"calib/avg_num_step_conf": 4.4375,
"calib/ece": 0.2934387351778656,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6758893280632411,
"calib/gap": 0.22017954574292598,
"calib/mean_conf": 0.7472727272727273,
"calib/mu_c": 0.8438732394366197,
"calib/mu_w": 0.6236936936936938,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.23972332015810277,
"calib/std_conf": 0.3805302010619262,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4970400843881857,
"calib/step_q_c_n": 632.0,
"calib/step_q_gap": 0.09559167168977295,
"calib/step_q_w": 0.40144841269841275,
"calib/step_q_w_n": 504.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1354.0,
"completions/max_terminated_length": 1354.0,
"completions/mean_length": 407.484375,
"completions/mean_terminated_length": 409.0823669433594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 97.0,
"epoch": 0.1376,
"grad_norm": 0.06957102566957474,
"kl": 0.1775360107421875,
"learning_rate": 1.9722222222222224e-06,
"loss": 0.004,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.040567055344581604,
"mask/share_reasoning": 0.8331802487373352,
"mask/share_step_conf": 0.12234644591808319,
"num_tokens": 29074975.0,
"reward": 1.3078129291534424,
"reward_std": 0.2828935980796814,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.6708804368972778,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8184664249420166,
"step": 129
},
{
"adv/mean_abs_final_conf": 0.576531708240509,
"adv/mean_abs_reasoning": 0.4125916063785553,
"adv/mean_abs_step_conf": 0.7666875123977661,
"adv/ratio_final_to_reasoning": 1.3973423097500866,
"adv/ratio_step_to_reasoning": 1.8582237266705945,
"adv/std_final_conf": 0.7933287620544434,
"adv/std_reasoning": 0.6816454529762268,
"adv/std_step_conf": 0.9361461400985718,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7252351097178683,
"calib/avg_num_step_conf": 4.16015625,
"calib/ece": 0.23817647058823524,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6745098039215687,
"calib/gap": 0.3667382445141065,
"calib/mean_conf": 0.7327647058823529,
"calib/mu_c": 0.8909655172413793,
"calib/mu_w": 0.5242272727272728,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.20115686274509798,
"calib/std_conf": 0.39607969034938717,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5254081128747796,
"calib/step_q_c_n": 567.0,
"calib/step_q_gap": 0.14184272052065489,
"calib/step_q_w": 0.38356539235412473,
"calib/step_q_w_n": 497.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1841.0,
"completions/max_terminated_length": 1841.0,
"completions/mean_length": 386.44140625,
"completions/mean_terminated_length": 386.44140625,
"completions/min_length": 74.0,
"completions/min_terminated_length": 74.0,
"epoch": 0.13866666666666666,
"grad_norm": 0.049915507435798645,
"kl": 0.1814117431640625,
"learning_rate": 1.944444444444445e-06,
"loss": 0.0389,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.043779339641332626,
"mask/share_reasoning": 0.8376109600067139,
"mask/share_step_conf": 0.1186097040772438,
"num_tokens": 29279192.0,
"reward": 1.3503212928771973,
"reward_std": 0.24141067266464233,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.7408499121665955,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8244274854660034,
"step": 130
},
{
"adv/mean_abs_final_conf": 0.6712453365325928,
"adv/mean_abs_reasoning": 0.35178640484809875,
"adv/mean_abs_step_conf": 0.7708390951156616,
"adv/ratio_final_to_reasoning": 1.9081048252061823,
"adv/ratio_step_to_reasoning": 2.1912134309127427,
"adv/std_final_conf": 0.8527726531028748,
"adv/std_reasoning": 0.6401533484458923,
"adv/std_step_conf": 0.935929000377655,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7221153846153847,
"calib/avg_num_step_conf": 4.1484375,
"calib/ece": 0.25347656249999995,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.51953125,
"calib/gap": 0.393525641025641,
"calib/mean_conf": 0.5926953125,
"calib/mu_c": 0.8325,
"calib/mu_w": 0.438974358974359,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.22777343749999995,
"calib/std_conf": 0.435085481159194,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5067487046632124,
"calib/step_q_c_n": 386.0,
"calib/step_q_gap": 0.15886926679338992,
"calib/step_q_w": 0.3478794378698225,
"calib/step_q_w_n": 676.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1198.0,
"completions/max_terminated_length": 1198.0,
"completions/mean_length": 390.640625,
"completions/mean_terminated_length": 392.1725769042969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 77.0,
"epoch": 0.13973333333333332,
"grad_norm": 0.056483618915081024,
"kl": 0.177459716796875,
"learning_rate": 1.916666666666667e-06,
"loss": -0.0283,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.041544727981090546,
"mask/share_reasoning": 0.8425483703613281,
"mask/share_step_conf": 0.11200062930583954,
"num_tokens": 29485404.0,
"reward": 1.3231122493743896,
"reward_std": 0.23826926946640015,
"rewards/accuracy_reward_step": 0.390625,
"rewards/final_brier_reward_step": 0.7114789485931396,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.829091489315033,
"step": 131
},
{
"adv/mean_abs_final_conf": 0.6433945894241333,
"adv/mean_abs_reasoning": 0.5586982369422913,
"adv/mean_abs_step_conf": 0.7547528743743896,
"adv/ratio_final_to_reasoning": 1.1515958828604473,
"adv/ratio_step_to_reasoning": 1.3509132917710445,
"adv/std_final_conf": 0.8603805899620056,
"adv/std_reasoning": 0.8097853064537048,
"adv/std_step_conf": 0.9360271096229553,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7476470588235294,
"calib/avg_num_step_conf": 4.27734375,
"calib/ece": 0.20990777338603434,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6324110671936759,
"calib/gap": 0.3873812636165578,
"calib/mean_conf": 0.7022661396574441,
"calib/mu_c": 0.8553812636165578,
"calib/mu_w": 0.46799999999999997,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15371541501976294,
"calib/std_conf": 0.40523718322501656,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5632342657342657,
"calib/step_q_c_n": 572.0,
"calib/step_q_gap": 0.2445589311262351,
"calib/step_q_w": 0.3186753346080306,
"calib/step_q_w_n": 523.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2175.0,
"completions/max_terminated_length": 2175.0,
"completions/mean_length": 409.47265625,
"completions/mean_terminated_length": 411.0784606933594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 87.0,
"epoch": 0.1408,
"grad_norm": 0.035640228539705276,
"kl": 0.1730194091796875,
"learning_rate": 1.888888888888889e-06,
"loss": 0.0443,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.0445302277803421,
"mask/share_reasoning": 0.8328328132629395,
"mask/share_step_conf": 0.11873072385787964,
"num_tokens": 29695821.0,
"reward": 1.4004900455474854,
"reward_std": 0.28041917085647583,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.763381838798523,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8602052927017212,
"step": 132
},
{
"adv/mean_abs_final_conf": 0.7644220590591431,
"adv/mean_abs_reasoning": 0.5505396723747253,
"adv/mean_abs_step_conf": 0.7615689039230347,
"adv/ratio_final_to_reasoning": 1.3884958658144413,
"adv/ratio_step_to_reasoning": 1.3833133961773278,
"adv/std_final_conf": 0.9061704874038696,
"adv/std_reasoning": 0.7754989862442017,
"adv/std_step_conf": 0.9363312125205994,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6629651860744298,
"calib/avg_num_step_conf": 4.7890625,
"calib/ece": 0.29396812749003975,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.4342629482071713,
"calib/gap": 0.27101127117513646,
"calib/mean_conf": 0.5375059760956175,
"calib/mu_c": 0.7027040816326529,
"calib/mu_w": 0.4316928104575164,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.22051792828685254,
"calib/std_conf": 0.43009438758858803,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.42600412637008384,
"calib/step_q_c_n": 517.0,
"calib/step_q_gap": 0.10147463778417776,
"calib/step_q_w": 0.3245294885859061,
"calib/step_q_w_n": 709.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2483.0,
"completions/max_terminated_length": 2483.0,
"completions/mean_length": 501.0,
"completions/mean_terminated_length": 502.9647216796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 119.0,
"epoch": 0.14186666666666667,
"grad_norm": 0.050603996962308884,
"kl": 0.15277099609375,
"learning_rate": 1.8611111111111113e-06,
"loss": -0.1137,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.034230709075927734,
"mask/share_reasoning": 0.8573011159896851,
"mask/share_step_conf": 0.1045619398355484,
"num_tokens": 29930421.0,
"reward": 1.2970893383026123,
"reward_std": 0.3156052231788635,
"rewards/accuracy_reward_step": 0.3828125,
"rewards/final_brier_reward_step": 0.6671197414398193,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8275918960571289,
"step": 133
},
{
"adv/mean_abs_final_conf": 0.7715365290641785,
"adv/mean_abs_reasoning": 0.6042397618293762,
"adv/mean_abs_step_conf": 0.7806218862533569,
"adv/ratio_final_to_reasoning": 1.2768714967189516,
"adv/ratio_step_to_reasoning": 1.2919075101743918,
"adv/std_final_conf": 0.9153911471366882,
"adv/std_reasoning": 0.8099719285964966,
"adv/std_step_conf": 0.9359931945800781,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.714391405043747,
"calib/avg_num_step_conf": 4.09375,
"calib/ece": 0.25439999999999996,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.44,
"calib/gap": 0.3235370560988163,
"calib/mean_conf": 0.5473600000000001,
"calib/mu_c": 0.7207758620689655,
"calib/mu_w": 0.3972388059701492,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.16887999999999995,
"calib/std_conf": 0.4274260525517835,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.44735641547861504,
"calib/step_q_c_n": 491.0,
"calib/step_q_gap": 0.12831512283947677,
"calib/step_q_w": 0.3190412926391383,
"calib/step_q_w_n": 557.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2889.0,
"completions/max_terminated_length": 2889.0,
"completions/mean_length": 491.48828125,
"completions/mean_terminated_length": 493.41571044921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.14293333333333333,
"grad_norm": 0.052062638103961945,
"kl": 0.14556884765625,
"learning_rate": 1.8333333333333333e-06,
"loss": 0.0223,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03546704351902008,
"mask/share_reasoning": 0.8622174263000488,
"mask/share_step_conf": 0.09840920567512512,
"num_tokens": 30165194.0,
"reward": 1.3123855590820312,
"reward_std": 0.32143712043762207,
"rewards/accuracy_reward_step": 0.453125,
"rewards/final_brier_reward_step": 0.6979237794876099,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8212360143661499,
"step": 134
},
{
"adv/mean_abs_final_conf": 0.7274664640426636,
"adv/mean_abs_reasoning": 0.6027958393096924,
"adv/mean_abs_step_conf": 0.7395835518836975,
"adv/ratio_final_to_reasoning": 1.2068206457359445,
"adv/ratio_step_to_reasoning": 1.226922124629545,
"adv/std_final_conf": 0.8909457325935364,
"adv/std_reasoning": 0.8100106716156006,
"adv/std_step_conf": 0.9361552000045776,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6833225178455549,
"calib/avg_num_step_conf": 4.6015625,
"calib/ece": 0.27297188755020074,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.42971887550200805,
"calib/gap": 0.2680350421804024,
"calib/mean_conf": 0.5455823293172691,
"calib/mu_c": 0.6898260869565218,
"calib/mu_w": 0.4217910447761194,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.17835341365461843,
"calib/std_conf": 0.4263713550163871,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.44290066225165564,
"calib/step_q_c_n": 453.0,
"calib/step_q_gap": 0.1375089381137246,
"calib/step_q_w": 0.30539172413793103,
"calib/step_q_w_n": 725.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2363.0,
"completions/max_terminated_length": 2363.0,
"completions/mean_length": 482.140625,
"completions/mean_terminated_length": 484.0314025878906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.144,
"grad_norm": 0.041505929082632065,
"kl": 0.151641845703125,
"learning_rate": 1.8055555555555557e-06,
"loss": -0.0545,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03815152496099472,
"mask/share_reasoning": 0.8518142104148865,
"mask/share_step_conf": 0.1061280220746994,
"num_tokens": 30394502.0,
"reward": 1.3121750354766846,
"reward_std": 0.2834535241127014,
"rewards/accuracy_reward_step": 0.44921875,
"rewards/final_brier_reward_step": 0.6768605709075928,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8315571546554565,
"step": 135
},
{
"adv/mean_abs_final_conf": 0.6537525653839111,
"adv/mean_abs_reasoning": 0.42274731397628784,
"adv/mean_abs_step_conf": 0.7515841722488403,
"adv/ratio_final_to_reasoning": 1.5464381292806524,
"adv/ratio_step_to_reasoning": 1.7778567654979758,
"adv/std_final_conf": 0.8735095262527466,
"adv/std_reasoning": 0.701313853263855,
"adv/std_step_conf": 0.9356544613838196,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7858732277957912,
"calib/avg_num_step_conf": 5.02734375,
"calib/ece": 0.1786220472440945,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.3188976377952756,
"calib/gap": 0.4211068726556043,
"calib/mean_conf": 0.4246062992125984,
"calib/mu_c": 0.6683177570093458,
"calib/mu_w": 0.24721088435374153,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09098425196850395,
"calib/std_conf": 0.42097239622749233,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.45806739130434787,
"calib/step_q_c_n": 460.0,
"calib/step_q_gap": 0.22600596446033333,
"calib/step_q_w": 0.23206142684401454,
"calib/step_q_w_n": 827.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2804.0,
"completions/max_terminated_length": 2804.0,
"completions/mean_length": 457.07421875,
"completions/mean_terminated_length": 457.07421875,
"completions/min_length": 95.0,
"completions/min_terminated_length": 95.0,
"epoch": 0.14506666666666668,
"grad_norm": 0.05797513574361801,
"kl": 0.1590728759765625,
"learning_rate": 1.777777777777778e-06,
"loss": 0.0713,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03888920694589615,
"mask/share_reasoning": 0.8414955139160156,
"mask/share_step_conf": 0.11961531639099121,
"num_tokens": 30620001.0,
"reward": 1.4119293689727783,
"reward_std": 0.20032727718353271,
"rewards/accuracy_reward_step": 0.41796875,
"rewards/final_brier_reward_step": 0.7781753540039062,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8818260431289673,
"step": 136
},
{
"adv/mean_abs_final_conf": 0.7331190705299377,
"adv/mean_abs_reasoning": 0.6162766814231873,
"adv/mean_abs_step_conf": 0.7503564953804016,
"adv/ratio_final_to_reasoning": 1.1895940453838407,
"adv/ratio_step_to_reasoning": 1.2175643148586761,
"adv/std_final_conf": 0.9069944620132446,
"adv/std_reasoning": 0.858950138092041,
"adv/std_step_conf": 0.9364016056060791,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6668528517346244,
"calib/avg_num_step_conf": 4.76171875,
"calib/ece": 0.2797456692913386,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.4015748031496063,
"calib/gap": 0.24683139080245753,
"calib/mean_conf": 0.5051362204724409,
"calib/mu_c": 0.6324390243902438,
"calib/mu_w": 0.38560763358778627,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.15031496062992125,
"calib/std_conf": 0.4260757041840703,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.39144387755102045,
"calib/step_q_c_n": 588.0,
"calib/step_q_gap": 0.09878999482518841,
"calib/step_q_w": 0.29265388272583204,
"calib/step_q_w_n": 631.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1466.0,
"completions/max_terminated_length": 1466.0,
"completions/mean_length": 418.046875,
"completions/mean_terminated_length": 419.6863098144531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.14613333333333334,
"grad_norm": 0.03714418783783913,
"kl": 0.16357421875,
"learning_rate": 1.75e-06,
"loss": -0.0739,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03954150900244713,
"mask/share_reasoning": 0.8390306234359741,
"mask/share_step_conf": 0.11752159893512726,
"num_tokens": 30834005.0,
"reward": 1.3112924098968506,
"reward_std": 0.3159603476524353,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.6747555136680603,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.827820897102356,
"step": 137
},
{
"adv/mean_abs_final_conf": 0.7053368091583252,
"adv/mean_abs_reasoning": 0.6283121109008789,
"adv/mean_abs_step_conf": 0.7511765956878662,
"adv/ratio_final_to_reasoning": 1.1225898672349441,
"adv/ratio_step_to_reasoning": 1.1955468988347577,
"adv/std_final_conf": 0.8911412358283997,
"adv/std_reasoning": 0.8100284934043884,
"adv/std_step_conf": 0.9361639022827148,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7114058703788824,
"calib/avg_num_step_conf": 4.328125,
"calib/ece": 0.26853018372703413,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.4251968503937008,
"calib/gap": 0.3181723466193885,
"calib/mean_conf": 0.5242257217847768,
"calib/mu_c": 0.6457324840764331,
"calib/mu_w": 0.32756013745704465,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08732283464566928,
"calib/std_conf": 0.4289380259018184,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.41454853620955323,
"calib/step_q_c_n": 649.0,
"calib/step_q_gap": 0.09478295886750532,
"calib/step_q_w": 0.3197655773420479,
"calib/step_q_w_n": 459.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2229.0,
"completions/max_terminated_length": 2229.0,
"completions/mean_length": 434.875,
"completions/mean_terminated_length": 436.5804138183594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.1472,
"grad_norm": 0.06317199766635895,
"kl": 0.158172607421875,
"learning_rate": 1.7222222222222224e-06,
"loss": -0.0748,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04075439274311066,
"mask/share_reasoning": 0.8472160696983337,
"mask/share_step_conf": 0.1081232875585556,
"num_tokens": 31049669.0,
"reward": 1.3243135213851929,
"reward_std": 0.27699100971221924,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7157214879989624,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8059059381484985,
"step": 138
},
{
"adv/mean_abs_final_conf": 0.7152689695358276,
"adv/mean_abs_reasoning": 0.5647158622741699,
"adv/mean_abs_step_conf": 0.7506711483001709,
"adv/ratio_final_to_reasoning": 1.266599749217889,
"adv/ratio_step_to_reasoning": 1.3292899995355887,
"adv/std_final_conf": 0.8902831077575684,
"adv/std_reasoning": 0.7928001880645752,
"adv/std_step_conf": 0.9358660578727722,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.722875451038945,
"calib/avg_num_step_conf": 4.515625,
"calib/ece": 0.24751372549019607,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.3411764705882353,
"calib/gap": 0.3153568495707354,
"calib/mean_conf": 0.46291764705882354,
"calib/mu_c": 0.6039007092198582,
"calib/mu_w": 0.28854385964912277,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07874509803921569,
"calib/std_conf": 0.41096522094434734,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.43868451612903225,
"calib/step_q_c_n": 620.0,
"calib/step_q_gap": 0.1349643668753009,
"calib/step_q_w": 0.30372014925373136,
"calib/step_q_w_n": 536.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1068.0,
"completions/max_terminated_length": 1068.0,
"completions/mean_length": 402.203125,
"completions/mean_terminated_length": 403.7804260253906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.14826666666666666,
"grad_norm": 0.057961318641901016,
"kl": 0.1760101318359375,
"learning_rate": 1.6944444444444446e-06,
"loss": -0.0217,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.0406876876950264,
"mask/share_reasoning": 0.837753415107727,
"mask/share_step_conf": 0.11765265464782715,
"num_tokens": 31255729.0,
"reward": 1.3755717277526855,
"reward_std": 0.22052976489067078,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7288585305213928,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8564548492431641,
"step": 139
},
{
"adv/mean_abs_final_conf": 0.5774595737457275,
"adv/mean_abs_reasoning": 0.5195547342300415,
"adv/mean_abs_step_conf": 0.7685931921005249,
"adv/ratio_final_to_reasoning": 1.1114508938148713,
"adv/ratio_step_to_reasoning": 1.4793305526116474,
"adv/std_final_conf": 0.8099300265312195,
"adv/std_reasoning": 0.7394070625305176,
"adv/std_step_conf": 0.9359972476959229,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.781879630833225,
"calib/avg_num_step_conf": 4.16796875,
"calib/ece": 0.19671764705882355,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5254901960784314,
"calib/gap": 0.43335928766411,
"calib/mean_conf": 0.6247725490196079,
"calib/mu_c": 0.7913184713375795,
"calib/mu_w": 0.35795918367346946,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.10290196078431375,
"calib/std_conf": 0.4220057466751322,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.49226213592233015,
"calib/step_q_c_n": 618.0,
"calib/step_q_gap": 0.15976547667956847,
"calib/step_q_w": 0.33249665924276167,
"calib/step_q_w_n": 449.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1566.0,
"completions/max_terminated_length": 1566.0,
"completions/mean_length": 425.296875,
"completions/mean_terminated_length": 426.9647216796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 80.0,
"epoch": 0.14933333333333335,
"grad_norm": 0.051009099930524826,
"kl": 0.157012939453125,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0003,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.040174562484025955,
"mask/share_reasoning": 0.8454655408859253,
"mask/share_step_conf": 0.11045361310243607,
"num_tokens": 31469621.0,
"reward": 1.408362627029419,
"reward_std": 0.21559756994247437,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7836502194404602,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8559907674789429,
"step": 140
},
{
"adv/mean_abs_final_conf": 0.6383939981460571,
"adv/mean_abs_reasoning": 0.46379637718200684,
"adv/mean_abs_step_conf": 0.7589306831359863,
"adv/ratio_final_to_reasoning": 1.3764531797874162,
"adv/ratio_step_to_reasoning": 1.6363445694578171,
"adv/std_final_conf": 0.8568813800811768,
"adv/std_reasoning": 0.7392438054084778,
"adv/std_step_conf": 0.9359174370765686,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.834562868051511,
"calib/avg_num_step_conf": 4.44921875,
"calib/ece": 0.14496062992125985,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.468503937007874,
"calib/gap": 0.5326189089497185,
"calib/mean_conf": 0.5637007874015748,
"calib/mu_c": 0.7754901960784314,
"calib/mu_w": 0.2428712871287129,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.05314960629921259,
"calib/std_conf": 0.431495847462444,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.44982689335394127,
"calib/step_q_c_n": 647.0,
"calib/step_q_gap": 0.1612394949799575,
"calib/step_q_w": 0.2885873983739838,
"calib/step_q_w_n": 492.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2859.0,
"completions/max_terminated_length": 2859.0,
"completions/mean_length": 461.86328125,
"completions/mean_terminated_length": 461.86328125,
"completions/min_length": 84.0,
"completions/min_terminated_length": 84.0,
"epoch": 0.1504,
"grad_norm": 0.04535761475563049,
"kl": 0.1490936279296875,
"learning_rate": 1.638888888888889e-06,
"loss": 0.0648,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03858054429292679,
"mask/share_reasoning": 0.8557083606719971,
"mask/share_step_conf": 0.10571112483739853,
"num_tokens": 31694954.0,
"reward": 1.435590147972107,
"reward_std": 0.21061460673809052,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.8199445009231567,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8670241236686707,
"step": 141
},
{
"adv/mean_abs_final_conf": 0.696391224861145,
"adv/mean_abs_reasoning": 0.43528157472610474,
"adv/mean_abs_step_conf": 0.7312085032463074,
"adv/ratio_final_to_reasoning": 1.5998637785193186,
"adv/ratio_step_to_reasoning": 1.6798517228908916,
"adv/std_final_conf": 0.8764922618865967,
"adv/std_reasoning": 0.7204663753509521,
"adv/std_step_conf": 0.9357934594154358,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7650992234685073,
"calib/avg_num_step_conf": 4.515625,
"calib/ece": 0.20349411764705883,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.403921568627451,
"calib/gap": 0.41327640823369904,
"calib/mean_conf": 0.5069372549019608,
"calib/mu_c": 0.7046616541353383,
"calib/mu_w": 0.2913852459016393,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09443137254901962,
"calib/std_conf": 0.4355757469799832,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.47131958762886594,
"calib/step_q_c_n": 582.0,
"calib/step_q_gap": 0.1915607026114443,
"calib/step_q_w": 0.27975888501742163,
"calib/step_q_w_n": 574.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1138.0,
"completions/max_terminated_length": 1138.0,
"completions/mean_length": 435.87109375,
"completions/mean_terminated_length": 437.5804138183594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.15146666666666667,
"grad_norm": 0.03959168866276741,
"kl": 0.1544036865234375,
"learning_rate": 1.6111111111111113e-06,
"loss": -0.0305,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03837796300649643,
"mask/share_reasoning": 0.8447139263153076,
"mask/share_step_conf": 0.11300183832645416,
"num_tokens": 31911697.0,
"reward": 1.386702299118042,
"reward_std": 0.22725032269954681,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.76378333568573,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8532482385635376,
"step": 142
},
{
"adv/mean_abs_final_conf": 0.6530799865722656,
"adv/mean_abs_reasoning": 0.49441465735435486,
"adv/mean_abs_step_conf": 0.7745805978775024,
"adv/ratio_final_to_reasoning": 1.3209155045421577,
"adv/ratio_step_to_reasoning": 1.5666618826034282,
"adv/std_final_conf": 0.8723351955413818,
"adv/std_reasoning": 0.7576212286949158,
"adv/std_step_conf": 0.9359222054481506,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.8299148606811146,
"calib/avg_num_step_conf": 5.40234375,
"calib/ece": 0.17129200000000003,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.452,
"calib/gap": 0.5069794891640866,
"calib/mean_conf": 0.5489400000000001,
"calib/mu_c": 0.8247368421052632,
"calib/mu_w": 0.3177573529411765,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.13211600000000004,
"calib/std_conf": 0.4300814625161145,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.46212589928057557,
"calib/step_q_c_n": 556.0,
"calib/step_q_gap": 0.20496793608831437,
"calib/step_q_w": 0.2571579631922612,
"calib/step_q_w_n": 827.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2599.0,
"completions/max_terminated_length": 2599.0,
"completions/mean_length": 459.6484375,
"completions/mean_terminated_length": 459.6484375,
"completions/min_length": 132.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.15253333333333333,
"grad_norm": 0.053450778126716614,
"kl": 0.155853271484375,
"learning_rate": 1.5833333333333333e-06,
"loss": -0.0157,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03666718304157257,
"mask/share_reasoning": 0.8401566743850708,
"mask/share_step_conf": 0.12317609786987305,
"num_tokens": 32136703.0,
"reward": 1.3771334886550903,
"reward_std": 0.2412530779838562,
"rewards/accuracy_reward_step": 0.4453125,
"rewards/final_brier_reward_step": 0.7908738255500793,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8395090103149414,
"step": 143
},
{
"adv/mean_abs_final_conf": 0.6271746754646301,
"adv/mean_abs_reasoning": 0.499098002910614,
"adv/mean_abs_step_conf": 0.7657457590103149,
"adv/ratio_final_to_reasoning": 1.2566162793822158,
"adv/ratio_step_to_reasoning": 1.5342593128898099,
"adv/std_final_conf": 0.8462645411491394,
"adv/std_reasoning": 0.7575715184211731,
"adv/std_step_conf": 0.9356332421302795,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.692873892652423,
"calib/avg_num_step_conf": 4.41796875,
"calib/ece": 0.2912648221343873,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5059288537549407,
"calib/gap": 0.3031103439291297,
"calib/mean_conf": 0.5765612648221344,
"calib/mu_c": 0.6975657894736842,
"calib/mu_w": 0.3944554455445545,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.13351778656126478,
"calib/std_conf": 0.44622664042621685,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.47541877579530867,
"calib/step_q_c_n": 648.0,
"calib/step_q_gap": 0.1618338896669443,
"calib/step_q_w": 0.3135848861283644,
"calib/step_q_w_n": 483.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2372.0,
"completions/max_terminated_length": 2372.0,
"completions/mean_length": 421.4921875,
"completions/mean_terminated_length": 423.1451110839844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 78.0,
"epoch": 0.1536,
"grad_norm": 0.06617000699043274,
"kl": 0.21453857421875,
"learning_rate": 1.5555555555555558e-06,
"loss": 0.0071,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.040411271154880524,
"mask/share_reasoning": 0.840194821357727,
"mask/share_step_conf": 0.11548765003681183,
"num_tokens": 32348733.0,
"reward": 1.3366563320159912,
"reward_std": 0.23944231867790222,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.697578489780426,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.829663872718811,
"step": 144
},
{
"adv/mean_abs_final_conf": 0.7166906595230103,
"adv/mean_abs_reasoning": 0.5873551368713379,
"adv/mean_abs_step_conf": 0.7748841047286987,
"adv/ratio_final_to_reasoning": 1.2201998663714824,
"adv/ratio_step_to_reasoning": 1.3192769690519275,
"adv/std_final_conf": 0.8760490417480469,
"adv/std_reasoning": 0.8265672326087952,
"adv/std_step_conf": 0.9361008405685425,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7301093514328808,
"calib/avg_num_step_conf": 4.65625,
"calib/ece": 0.2123162055335969,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.49407114624505927,
"calib/gap": 0.3609771241830065,
"calib/mean_conf": 0.6051541501976284,
"calib/mu_c": 0.7720882352941176,
"calib/mu_w": 0.41111111111111115,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.13996047430830044,
"calib/std_conf": 0.41960309209280505,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.45034983922829586,
"calib/step_q_c_n": 622.0,
"calib/step_q_gap": 0.09043054098268183,
"calib/step_q_w": 0.359919298245614,
"calib/step_q_w_n": 570.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1856.0,
"completions/max_terminated_length": 1856.0,
"completions/mean_length": 411.8671875,
"completions/mean_terminated_length": 413.4823913574219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.15466666666666667,
"grad_norm": 0.050251394510269165,
"kl": 0.156768798828125,
"learning_rate": 1.527777777777778e-06,
"loss": 0.001,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.04229770600795746,
"mask/share_reasoning": 0.8285897970199585,
"mask/share_step_conf": 0.12520626187324524,
"num_tokens": 32556875.0,
"reward": 1.3466042280197144,
"reward_std": 0.2541048228740692,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.7375452518463135,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8262690305709839,
"step": 145
},
{
"adv/mean_abs_final_conf": 0.7096195220947266,
"adv/mean_abs_reasoning": 0.569200873374939,
"adv/mean_abs_step_conf": 0.7494736909866333,
"adv/ratio_final_to_reasoning": 1.2466943662387746,
"adv/ratio_step_to_reasoning": 1.316712124039463,
"adv/std_final_conf": 0.8874281048774719,
"adv/std_reasoning": 0.7928953170776367,
"adv/std_step_conf": 0.9355780482292175,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7482400268186389,
"calib/avg_num_step_conf": 4.4609375,
"calib/ece": 0.22743650793650794,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.4523809523809524,
"calib/gap": 0.413036540395575,
"calib/mean_conf": 0.5476190476190477,
"calib/mu_c": 0.8049473684210527,
"calib/mu_w": 0.3919108280254777,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1990357142857143,
"calib/std_conf": 0.43866439348706965,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5529746835443038,
"calib/step_q_c_n": 395.0,
"calib/step_q_gap": 0.21941967685086333,
"calib/step_q_w": 0.33355500669344046,
"calib/step_q_w_n": 747.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1983.0,
"completions/max_terminated_length": 1983.0,
"completions/mean_length": 436.28515625,
"completions/mean_terminated_length": 437.99609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.15573333333333333,
"grad_norm": 0.04094357788562775,
"kl": 0.159423828125,
"learning_rate": 1.5e-06,
"loss": -0.0788,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03925366699695587,
"mask/share_reasoning": 0.8443082571029663,
"mask/share_step_conf": 0.11253180354833603,
"num_tokens": 32775780.0,
"reward": 1.3252230882644653,
"reward_std": 0.2993590831756592,
"rewards/accuracy_reward_step": 0.37109375,
"rewards/final_brier_reward_step": 0.726082444190979,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.82663494348526,
"step": 146
},
{
"adv/mean_abs_final_conf": 0.705093502998352,
"adv/mean_abs_reasoning": 0.47931668162345886,
"adv/mean_abs_step_conf": 0.775804877281189,
"adv/ratio_final_to_reasoning": 1.471038939454769,
"adv/ratio_step_to_reasoning": 1.6185643167133603,
"adv/std_final_conf": 0.8910297155380249,
"adv/std_reasoning": 0.7393829226493835,
"adv/std_step_conf": 0.9361888766288757,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7350313693398799,
"calib/avg_num_step_conf": 4.5703125,
"calib/ece": 0.29897333333333337,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.56,
"calib/gap": 0.40094426259319893,
"calib/mean_conf": 0.6338533333333333,
"calib/mu_c": 0.8840425531914895,
"calib/mu_w": 0.48309829059829057,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2784133333333334,
"calib/std_conf": 0.4330953662493593,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5375308641975309,
"calib/step_q_c_n": 405.0,
"calib/step_q_gap": 0.18174968772694272,
"calib/step_q_w": 0.3557811764705882,
"calib/step_q_w_n": 765.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2535.0,
"completions/max_terminated_length": 2535.0,
"completions/mean_length": 450.05078125,
"completions/mean_terminated_length": 450.05078125,
"completions/min_length": 146.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.1568,
"grad_norm": 0.04447196424007416,
"kl": 0.1577301025390625,
"learning_rate": 1.4722222222222225e-06,
"loss": -0.0047,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.037693917751312256,
"mask/share_reasoning": 0.8485679626464844,
"mask/share_step_conf": 0.11373814940452576,
"num_tokens": 32994673.0,
"reward": 1.3036630153656006,
"reward_std": 0.31476935744285583,
"rewards/accuracy_reward_step": 0.3671875,
"rewards/final_brier_reward_step": 0.6830648183822632,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8277555704116821,
"step": 147
},
{
"adv/mean_abs_final_conf": 0.5962350368499756,
"adv/mean_abs_reasoning": 0.5363343954086304,
"adv/mean_abs_step_conf": 0.7551847100257874,
"adv/ratio_final_to_reasoning": 1.111685250757985,
"adv/ratio_step_to_reasoning": 1.4080482558841225,
"adv/std_final_conf": 0.8239307403564453,
"adv/std_reasoning": 0.7576537728309631,
"adv/std_step_conf": 0.9358545541763306,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7532467532467533,
"calib/avg_num_step_conf": 4.51171875,
"calib/ece": 0.21653968253968264,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5515873015873016,
"calib/gap": 0.420508348794063,
"calib/mean_conf": 0.6239365079365078,
"calib/mu_c": 0.7874675324675324,
"calib/mu_w": 0.3669591836734694,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.11468253968253977,
"calib/std_conf": 0.43631919555894205,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5317462462462463,
"calib/step_q_c_n": 666.0,
"calib/step_q_gap": 0.224292258516185,
"calib/step_q_w": 0.30745398773006133,
"calib/step_q_w_n": 489.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1958.0,
"completions/max_terminated_length": 1958.0,
"completions/mean_length": 420.18359375,
"completions/mean_terminated_length": 421.8313903808594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 74.0,
"epoch": 0.15786666666666666,
"grad_norm": 0.038488779217004776,
"kl": 0.1638641357421875,
"learning_rate": 1.4444444444444445e-06,
"loss": 0.0168,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.04403883218765259,
"mask/share_reasoning": 0.8297275304794312,
"mask/share_step_conf": 0.12232742458581924,
"num_tokens": 33207352.0,
"reward": 1.3929460048675537,
"reward_std": 0.2436758577823639,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.7596205472946167,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8541513085365295,
"step": 148
},
{
"adv/mean_abs_final_conf": 0.6414650678634644,
"adv/mean_abs_reasoning": 0.5006125569343567,
"adv/mean_abs_step_conf": 0.7623882293701172,
"adv/ratio_final_to_reasoning": 1.281360323423883,
"adv/ratio_step_to_reasoning": 1.5229107196967215,
"adv/std_final_conf": 0.8594561219215393,
"adv/std_reasoning": 0.7575691342353821,
"adv/std_step_conf": 0.9285663366317749,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7966791979949874,
"calib/avg_num_step_conf": 4.88671875,
"calib/ece": 0.18441897233201587,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5098814229249012,
"calib/gap": 0.5147541353383458,
"calib/mean_conf": 0.587501976284585,
"calib/mu_c": 0.8316541353383459,
"calib/mu_w": 0.3169,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.12311462450592892,
"calib/std_conf": 0.4446132806874793,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4966707530647986,
"calib/step_q_c_n": 571.0,
"calib/step_q_gap": 0.20092957659421035,
"calib/step_q_w": 0.29574117647058823,
"calib/step_q_w_n": 680.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2772.0,
"completions/max_terminated_length": 2772.0,
"completions/mean_length": 485.91796875,
"completions/mean_terminated_length": 487.82354736328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 77.0,
"epoch": 0.15893333333333334,
"grad_norm": 0.07420997321605682,
"kl": 0.135040283203125,
"learning_rate": 1.4166666666666667e-06,
"loss": 0.0254,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.038157302886247635,
"mask/share_reasoning": 0.843529462814331,
"mask/share_step_conf": 0.11440698057413101,
"num_tokens": 33436203.0,
"reward": 1.431693434715271,
"reward_std": 0.2294609248638153,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.796412467956543,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8827059268951416,
"step": 149
},
{
"adv/mean_abs_final_conf": 0.7208597660064697,
"adv/mean_abs_reasoning": 0.5321922302246094,
"adv/mean_abs_step_conf": 0.7479966282844543,
"adv/ratio_final_to_reasoning": 1.354510128233616,
"adv/ratio_step_to_reasoning": 1.4055008431234812,
"adv/std_final_conf": 0.8971462845802307,
"adv/std_reasoning": 0.7754176259040833,
"adv/std_step_conf": 0.9362186193466187,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7468057366362449,
"calib/avg_num_step_conf": 4.484375,
"calib/ece": 0.22217580645161294,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6048387096774194,
"calib/gap": 0.3900070404172099,
"calib/mean_conf": 0.6926629032258064,
"calib/mu_c": 0.8782307692307693,
"calib/mu_w": 0.48822372881355935,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.19532258064516134,
"calib/std_conf": 0.4085760559094708,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5420354477611941,
"calib/step_q_c_n": 536.0,
"calib/step_q_gap": 0.19873261551718535,
"calib/step_q_w": 0.3433028322440087,
"calib/step_q_w_n": 612.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2437.0,
"completions/max_terminated_length": 2437.0,
"completions/mean_length": 406.93359375,
"completions/mean_terminated_length": 413.39288330078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 90.0,
"epoch": 0.16,
"grad_norm": 0.059314560145139694,
"kl": 0.1519622802734375,
"learning_rate": 1.3888888888888892e-06,
"loss": -0.0439,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.044302038848400116,
"mask/share_reasoning": 0.8175007104873657,
"mask/share_step_conf": 0.12257222831249237,
"num_tokens": 33645338.0,
"reward": 1.3415672779083252,
"reward_std": 0.32355982065200806,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.726384162902832,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.830718994140625,
"step": 150
},
{
"adv/mean_abs_final_conf": 0.7061728835105896,
"adv/mean_abs_reasoning": 0.4900563955307007,
"adv/mean_abs_step_conf": 0.7467561960220337,
"adv/ratio_final_to_reasoning": 1.4410033007442913,
"adv/ratio_step_to_reasoning": 1.5238168562484387,
"adv/std_final_conf": 0.8989728689193726,
"adv/std_reasoning": 0.7393714785575867,
"adv/std_step_conf": 0.9360334873199463,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7790909090909091,
"calib/avg_num_step_conf": 4.35546875,
"calib/ece": 0.2163453815261044,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.44176706827309237,
"calib/gap": 0.4591393939393939,
"calib/mean_conf": 0.5140160642570282,
"calib/mu_c": 0.7906060606060605,
"calib/mu_w": 0.33146666666666663,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.16638554216867468,
"calib/std_conf": 0.45318765644131903,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5401400491400491,
"calib/step_q_c_n": 407.0,
"calib/step_q_gap": 0.22112027512874965,
"calib/step_q_w": 0.31901977401129944,
"calib/step_q_w_n": 708.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2787.0,
"completions/max_terminated_length": 2787.0,
"completions/mean_length": 482.5234375,
"completions/mean_terminated_length": 484.41571044921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 97.0,
"epoch": 0.16106666666666666,
"grad_norm": 0.04952695593237877,
"kl": 0.127716064453125,
"learning_rate": 1.3611111111111112e-06,
"loss": 0.0168,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.0359514057636261,
"mask/share_reasoning": 0.8559660911560059,
"mask/share_step_conf": 0.10417624562978745,
"num_tokens": 33875888.0,
"reward": 1.3257105350494385,
"reward_std": 0.28718993067741394,
"rewards/accuracy_reward_step": 0.38671875,
"rewards/final_brier_reward_step": 0.7406706809997559,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8194376826286316,
"step": 151
},
{
"adv/mean_abs_final_conf": 0.7186774611473083,
"adv/mean_abs_reasoning": 0.582579493522644,
"adv/mean_abs_step_conf": 0.7576056718826294,
"adv/ratio_final_to_reasoning": 1.233612698589389,
"adv/ratio_step_to_reasoning": 1.3004331259613455,
"adv/std_final_conf": 0.9069194197654724,
"adv/std_reasoning": 0.8265808820724487,
"adv/std_step_conf": 0.9362088441848755,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7519455868396077,
"calib/avg_num_step_conf": 4.4375,
"calib/ece": 0.22714566929133856,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.452755905511811,
"calib/gap": 0.4167751344511232,
"calib/mean_conf": 0.5478937007874015,
"calib/mu_c": 0.785816513761468,
"calib/mu_w": 0.3690413793103448,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1729527559055118,
"calib/std_conf": 0.44473848871684885,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5037935483870968,
"calib/step_q_c_n": 465.0,
"calib/step_q_gap": 0.1702492854809679,
"calib/step_q_w": 0.3335442629061289,
"calib/step_q_w_n": 671.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1562.0,
"completions/max_terminated_length": 1562.0,
"completions/mean_length": 448.5703125,
"completions/mean_terminated_length": 450.3294372558594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 119.0,
"epoch": 0.16213333333333332,
"grad_norm": 0.053616829216480255,
"kl": 0.1596527099609375,
"learning_rate": 1.3333333333333334e-06,
"loss": -0.0532,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.039214253425598145,
"mask/share_reasoning": 0.8418387770652771,
"mask/share_step_conf": 0.11504073441028595,
"num_tokens": 34096114.0,
"reward": 1.3434820175170898,
"reward_std": 0.26486706733703613,
"rewards/accuracy_reward_step": 0.42578125,
"rewards/final_brier_reward_step": 0.7340766191482544,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8354280591011047,
"step": 152
},
{
"adv/mean_abs_final_conf": 0.6649273037910461,
"adv/mean_abs_reasoning": 0.48268380761146545,
"adv/mean_abs_step_conf": 0.7543078660964966,
"adv/ratio_final_to_reasoning": 1.3775628958456319,
"adv/ratio_step_to_reasoning": 1.5627370427633527,
"adv/std_final_conf": 0.8761218786239624,
"adv/std_reasoning": 0.7393322587013245,
"adv/std_step_conf": 0.9358770847320557,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6802699433373655,
"calib/avg_num_step_conf": 4.64453125,
"calib/ece": 0.31198412698412703,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.46825396825396826,
"calib/gap": 0.28423505443432867,
"calib/mean_conf": 0.5495238095238096,
"calib/mu_c": 0.676978417266187,
"calib/mu_w": 0.39274336283185834,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.1549603174603175,
"calib/std_conf": 0.449749237261425,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4632719412724307,
"calib/step_q_c_n": 613.0,
"calib/step_q_gap": 0.15083617738354183,
"calib/step_q_w": 0.3124357638888889,
"calib/step_q_w_n": 576.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2646.0,
"completions/max_terminated_length": 2646.0,
"completions/mean_length": 453.34765625,
"completions/mean_terminated_length": 455.1255187988281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.1632,
"grad_norm": 0.04672946408390999,
"kl": 0.1456146240234375,
"learning_rate": 1.3055555555555556e-06,
"loss": -0.0194,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.037633031606674194,
"mask/share_reasoning": 0.8455663919448853,
"mask/share_step_conf": 0.11289433389902115,
"num_tokens": 34319491.0,
"reward": 1.3227565288543701,
"reward_std": 0.26125621795654297,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.6762843132019043,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.832270622253418,
"step": 153
},
{
"adv/mean_abs_final_conf": 0.6391319036483765,
"adv/mean_abs_reasoning": 0.5661917924880981,
"adv/mean_abs_step_conf": 0.76639723777771,
"adv/ratio_final_to_reasoning": 1.1288258009529724,
"adv/ratio_step_to_reasoning": 1.3536000485097466,
"adv/std_final_conf": 0.8594533205032349,
"adv/std_reasoning": 0.8097780346870422,
"adv/std_step_conf": 0.9359622597694397,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7422602567329475,
"calib/avg_num_step_conf": 3.99609375,
"calib/ece": 0.2400395256916996,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5138339920948617,
"calib/gap": 0.4294601057135666,
"calib/mean_conf": 0.5810671936758893,
"calib/mu_c": 0.8136206896551724,
"calib/mu_w": 0.38416058394160585,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.18130434782608695,
"calib/std_conf": 0.45757658534215817,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5460468384074941,
"calib/step_q_c_n": 427.0,
"calib/step_q_gap": 0.19163912028668878,
"calib/step_q_w": 0.35440771812080535,
"calib/step_q_w_n": 596.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2351.0,
"completions/max_terminated_length": 2351.0,
"completions/mean_length": 413.70703125,
"completions/mean_terminated_length": 415.3294372558594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 110.0,
"epoch": 0.16426666666666667,
"grad_norm": 0.047010958194732666,
"kl": 0.16094970703125,
"learning_rate": 1.2777777777777779e-06,
"loss": 0.0255,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.0411284863948822,
"mask/share_reasoning": 0.8461230993270874,
"mask/share_step_conf": 0.10884217172861099,
"num_tokens": 34529840.0,
"reward": 1.3319199085235596,
"reward_std": 0.2809707522392273,
"rewards/accuracy_reward_step": 0.453125,
"rewards/final_brier_reward_step": 0.7279886603355408,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8241755366325378,
"step": 154
},
{
"adv/mean_abs_final_conf": 0.697257399559021,
"adv/mean_abs_reasoning": 0.39795076847076416,
"adv/mean_abs_step_conf": 0.7744704484939575,
"adv/ratio_final_to_reasoning": 1.7521197464661906,
"adv/ratio_step_to_reasoning": 1.9461463825539886,
"adv/std_final_conf": 0.8761575222015381,
"adv/std_reasoning": 0.6815242171287537,
"adv/std_step_conf": 0.9362286329269409,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7283950617283951,
"calib/avg_num_step_conf": 4.15234375,
"calib/ece": 0.26471764705882356,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.43137254901960786,
"calib/gap": 0.357225925925926,
"calib/mean_conf": 0.49904705882352945,
"calib/mu_c": 0.6881666666666667,
"calib/mu_w": 0.3309407407407407,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14658823529411769,
"calib/std_conf": 0.458563816993944,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.47445479999999995,
"calib/step_q_c_n": 500.0,
"calib/step_q_gap": 0.13788641634103016,
"calib/step_q_w": 0.3365683836589698,
"calib/step_q_w_n": 563.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1732.0,
"completions/max_terminated_length": 1732.0,
"completions/mean_length": 396.61328125,
"completions/mean_terminated_length": 396.61328125,
"completions/min_length": 119.0,
"completions/min_terminated_length": 119.0,
"epoch": 0.16533333333333333,
"grad_norm": 0.06110754236578941,
"kl": 0.1593475341796875,
"learning_rate": 1.25e-06,
"loss": -0.0123,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.041939422488212585,
"mask/share_reasoning": 0.8429463505744934,
"mask/share_step_conf": 0.11511427164077759,
"num_tokens": 34738589.0,
"reward": 1.3121984004974365,
"reward_std": 0.22575721144676208,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.7149654626846313,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8082312345504761,
"step": 155
},
{
"adv/mean_abs_final_conf": 0.6453638076782227,
"adv/mean_abs_reasoning": 0.533676028251648,
"adv/mean_abs_step_conf": 0.7646842002868652,
"adv/ratio_final_to_reasoning": 1.2092801128663584,
"adv/ratio_step_to_reasoning": 1.4328621856822252,
"adv/std_final_conf": 0.8599355816841125,
"adv/std_reasoning": 0.7576324343681335,
"adv/std_step_conf": 0.9362171292304993,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6508128016256033,
"calib/avg_num_step_conf": 4.83203125,
"calib/ece": 0.3132270916334661,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.4860557768924303,
"calib/gap": 0.28091503683007363,
"calib/mean_conf": 0.5432669322709163,
"calib/mu_c": 0.6854032258064516,
"calib/mu_w": 0.404488188976378,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.18123505976095616,
"calib/std_conf": 0.4599130400895085,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4531105263157894,
"calib/step_q_c_n": 570.0,
"calib/step_q_gap": 0.11236488913438014,
"calib/step_q_w": 0.34074563718140927,
"calib/step_q_w_n": 667.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2817.0,
"completions/max_terminated_length": 2817.0,
"completions/mean_length": 468.01171875,
"completions/mean_terminated_length": 469.8470764160156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 97.0,
"epoch": 0.1664,
"grad_norm": 0.06579665839672089,
"kl": 0.148345947265625,
"learning_rate": 1.2222222222222223e-06,
"loss": -0.0088,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.038678042590618134,
"mask/share_reasoning": 0.8419922590255737,
"mask/share_step_conf": 0.11542340368032455,
"num_tokens": 34963160.0,
"reward": 1.3079309463500977,
"reward_std": 0.27311986684799194,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.6633148193359375,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8297890424728394,
"step": 156
},
{
"adv/mean_abs_final_conf": 0.6082822680473328,
"adv/mean_abs_reasoning": 0.5066467523574829,
"adv/mean_abs_step_conf": 0.7343960404396057,
"adv/ratio_final_to_reasoning": 1.2006042972089106,
"adv/ratio_step_to_reasoning": 1.4495228421427364,
"adv/std_final_conf": 0.8255444169044495,
"adv/std_reasoning": 0.7575724720954895,
"adv/std_step_conf": 0.9362667798995972,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.776601206520344,
"calib/avg_num_step_conf": 5.015625,
"calib/ece": 0.22458498023715412,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5691699604743083,
"calib/gap": 0.49837889872930285,
"calib/mean_conf": 0.602213438735178,
"calib/mu_c": 0.8110204081632651,
"calib/mu_w": 0.3126415094339623,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.12288537549407111,
"calib/std_conf": 0.4677376400274536,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5158425925925926,
"calib/step_q_c_n": 756.0,
"calib/step_q_gap": 0.1860958122895623,
"calib/step_q_w": 0.3297467803030303,
"calib/step_q_w_n": 528.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2750.0,
"completions/max_terminated_length": 2750.0,
"completions/mean_length": 470.8671875,
"completions/mean_terminated_length": 470.8671875,
"completions/min_length": 134.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.16746666666666668,
"grad_norm": 0.04144881293177605,
"kl": 0.132781982421875,
"learning_rate": 1.1944444444444446e-06,
"loss": -0.045,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03978681564331055,
"mask/share_reasoning": 0.8324368000030518,
"mask/share_step_conf": 0.1277763694524765,
"num_tokens": 35187430.0,
"reward": 1.367426872253418,
"reward_std": 0.2844254970550537,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.769758939743042,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8266879916191101,
"step": 157
},
{
"adv/mean_abs_final_conf": 0.687097430229187,
"adv/mean_abs_reasoning": 0.5489099621772766,
"adv/mean_abs_step_conf": 0.7720958590507507,
"adv/ratio_final_to_reasoning": 1.2517488797320848,
"adv/ratio_step_to_reasoning": 1.4065983717770343,
"adv/std_final_conf": 0.8751606941223145,
"adv/std_reasoning": 0.7576378583908081,
"adv/std_step_conf": 0.9359956979751587,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6282976684765251,
"calib/avg_num_step_conf": 4.50390625,
"calib/ece": 0.33855468749999995,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.59765625,
"calib/gap": 0.22169977642925587,
"calib/mean_conf": 0.6432421875,
"calib/mu_c": 0.7307096774193549,
"calib/mu_w": 0.509009900990099,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18816406249999995,
"calib/std_conf": 0.4489902255007505,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5108185628742515,
"calib/step_q_c_n": 668.0,
"calib/step_q_gap": 0.1338536144206433,
"calib/step_q_w": 0.37696494845360823,
"calib/step_q_w_n": 485.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1232.0,
"completions/max_terminated_length": 1232.0,
"completions/mean_length": 405.16796875,
"completions/mean_terminated_length": 406.75689697265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.16853333333333334,
"grad_norm": 0.0779474750161171,
"kl": 0.1684417724609375,
"learning_rate": 1.1666666666666668e-06,
"loss": 0.0508,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.04238620772957802,
"mask/share_reasoning": 0.8300025463104248,
"mask/share_step_conf": 0.12370499968528748,
"num_tokens": 35396393.0,
"reward": 1.3117878437042236,
"reward_std": 0.2755977511405945,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.6640223264694214,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.8192297220230103,
"step": 158
},
{
"adv/mean_abs_final_conf": 0.6682296395301819,
"adv/mean_abs_reasoning": 0.5617543458938599,
"adv/mean_abs_step_conf": 0.7508109211921692,
"adv/ratio_final_to_reasoning": 1.189540667401334,
"adv/ratio_step_to_reasoning": 1.3365467070797357,
"adv/std_final_conf": 0.8761033415794373,
"adv/std_reasoning": 0.7928183078765869,
"adv/std_step_conf": 0.9362561702728271,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7296401515151515,
"calib/avg_num_step_conf": 4.28515625,
"calib/ece": 0.25996031746031745,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5515873015873016,
"calib/gap": 0.411590909090909,
"calib/mean_conf": 0.5977380952380952,
"calib/mu_c": 0.8133333333333332,
"calib/mu_w": 0.40174242424242423,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.1907539682539682,
"calib/std_conf": 0.4621413698118899,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.571676659528908,
"calib/step_q_c_n": 467.0,
"calib/step_q_gap": 0.21274173889398734,
"calib/step_q_w": 0.35893492063492066,
"calib/step_q_w_n": 630.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1673.0,
"completions/max_terminated_length": 1673.0,
"completions/mean_length": 417.32421875,
"completions/mean_terminated_length": 418.9608154296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 121.0,
"epoch": 0.1696,
"grad_norm": 0.05492505058646202,
"kl": 0.1497039794921875,
"learning_rate": 1.138888888888889e-06,
"loss": -0.0204,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.04090462625026703,
"mask/share_reasoning": 0.8396684527397156,
"mask/share_step_conf": 0.11552062630653381,
"num_tokens": 35608012.0,
"reward": 1.3436503410339355,
"reward_std": 0.2901962697505951,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.7161792516708374,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8406387567520142,
"step": 159
},
{
"adv/mean_abs_final_conf": 0.6749417185783386,
"adv/mean_abs_reasoning": 0.6084262132644653,
"adv/mean_abs_step_conf": 0.766987681388855,
"adv/ratio_final_to_reasoning": 1.109323865184883,
"adv/ratio_step_to_reasoning": 1.2606091990574173,
"adv/std_final_conf": 0.8604558706283569,
"adv/std_reasoning": 0.8266105651855469,
"adv/std_step_conf": 0.936232328414917,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7114664082687339,
"calib/avg_num_step_conf": 4.7421875,
"calib/ece": 0.2897710843373495,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5421686746987951,
"calib/gap": 0.36726356589147285,
"calib/mean_conf": 0.5802690763052208,
"calib/mu_c": 0.7572635658914728,
"calib/mu_w": 0.38999999999999996,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17598393574297197,
"calib/std_conf": 0.4717101714303749,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5031260550458716,
"calib/step_q_c_n": 545.0,
"calib/step_q_gap": 0.1928853973478148,
"calib/step_q_w": 0.3102406576980568,
"calib/step_q_w_n": 669.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2343.0,
"completions/max_terminated_length": 2343.0,
"completions/mean_length": 446.85546875,
"completions/mean_terminated_length": 448.6078796386719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.17066666666666666,
"grad_norm": 0.0584404356777668,
"kl": 0.150634765625,
"learning_rate": 1.111111111111111e-06,
"loss": 0.0536,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03910226374864578,
"mask/share_reasoning": 0.8406857252120972,
"mask/share_step_conf": 0.11630573123693466,
"num_tokens": 35827247.0,
"reward": 1.294600009918213,
"reward_std": 0.3159686028957367,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.6879982352256775,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.802944540977478,
"step": 160
},
{
"adv/mean_abs_final_conf": 0.5800888538360596,
"adv/mean_abs_reasoning": 0.4550928473472595,
"adv/mean_abs_step_conf": 0.7868247032165527,
"adv/ratio_final_to_reasoning": 1.2746604505375176,
"adv/ratio_step_to_reasoning": 1.7289322559186797,
"adv/std_final_conf": 0.8195415139198303,
"adv/std_reasoning": 0.7391253709793091,
"adv/std_step_conf": 0.9356487989425659,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7142903645833334,
"calib/avg_num_step_conf": 4.609375,
"calib/ece": 0.2741796874999999,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.5234375,
"calib/gap": 0.37472916666666667,
"calib/mean_conf": 0.5803515625,
"calib/mu_c": 0.720875,
"calib/mu_w": 0.3461458333333334,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11476562499999993,
"calib/std_conf": 0.46237442703809717,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.48515708274894814,
"calib/step_q_c_n": 713.0,
"calib/step_q_gap": 0.15993010202089675,
"calib/step_q_w": 0.3252269807280514,
"calib/step_q_w_n": 467.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1050.0,
"completions/max_terminated_length": 1050.0,
"completions/mean_length": 380.1796875,
"completions/mean_terminated_length": 381.6706237792969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 67.0,
"epoch": 0.17173333333333332,
"grad_norm": 0.04697565361857414,
"kl": 0.16094970703125,
"learning_rate": 1.0833333333333335e-06,
"loss": 0.0567,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.045005638152360916,
"mask/share_reasoning": 0.8235310912132263,
"mask/share_step_conf": 0.12755702435970306,
"num_tokens": 36028493.0,
"reward": 1.370754361152649,
"reward_std": 0.22566889226436615,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7254956960678101,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.8455064296722412,
"step": 161
},
{
"adv/mean_abs_final_conf": 0.683541476726532,
"adv/mean_abs_reasoning": 0.5417003035545349,
"adv/mean_abs_step_conf": 0.7814807891845703,
"adv/ratio_final_to_reasoning": 1.2618443671551633,
"adv/ratio_step_to_reasoning": 1.4426441780752959,
"adv/std_final_conf": 0.8726164102554321,
"adv/std_reasoning": 0.7753661870956421,
"adv/std_step_conf": 0.9360226392745972,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6877846659919028,
"calib/avg_num_step_conf": 4.34765625,
"calib/ece": 0.29037890625,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.61328125,
"calib/gap": 0.3148466599190283,
"calib/mean_conf": 0.65248828125,
"calib/mu_c": 0.7803947368421053,
"calib/mu_w": 0.4655480769230769,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17455859375000002,
"calib/std_conf": 0.44877169152328544,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.512444099378882,
"calib/step_q_c_n": 644.0,
"calib/step_q_gap": 0.10413066654306113,
"calib/step_q_w": 0.4083134328358209,
"calib/step_q_w_n": 469.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 951.0,
"completions/max_terminated_length": 951.0,
"completions/mean_length": 377.95703125,
"completions/mean_terminated_length": 379.4392395019531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.1728,
"grad_norm": 0.06931810081005096,
"kl": 0.1662750244140625,
"learning_rate": 1.0555555555555557e-06,
"loss": -0.0313,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.043304286897182465,
"mask/share_reasoning": 0.8308358788490295,
"mask/share_step_conf": 0.12195360660552979,
"num_tokens": 36229394.0,
"reward": 1.334316372871399,
"reward_std": 0.26881474256515503,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.70583176612854,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.8220254182815552,
"step": 162
},
{
"adv/mean_abs_final_conf": 0.5971531271934509,
"adv/mean_abs_reasoning": 0.5205508470535278,
"adv/mean_abs_step_conf": 0.7498682737350464,
"adv/ratio_final_to_reasoning": 1.1471561915104254,
"adv/ratio_step_to_reasoning": 1.440528390222633,
"adv/std_final_conf": 0.8285160064697266,
"adv/std_reasoning": 0.7576987743377686,
"adv/std_step_conf": 0.9356043338775635,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7737785218253969,
"calib/avg_num_step_conf": 4.609375,
"calib/ece": 0.23511811023622053,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.515748031496063,
"calib/gap": 0.4542906746031746,
"calib/mean_conf": 0.5743307086614173,
"calib/mu_c": 0.7996875,
"calib/mu_w": 0.34539682539682537,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.15275590551181112,
"calib/std_conf": 0.460342376096538,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5099199288256229,
"calib/step_q_c_n": 562.0,
"calib/step_q_gap": 0.1882223560100889,
"calib/step_q_w": 0.32169757281553396,
"calib/step_q_w_n": 618.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2094.0,
"completions/max_terminated_length": 2094.0,
"completions/mean_length": 449.578125,
"completions/mean_terminated_length": 449.578125,
"completions/min_length": 109.0,
"completions/min_terminated_length": 109.0,
"epoch": 0.17386666666666667,
"grad_norm": 0.047132719308137894,
"kl": 0.14910888671875,
"learning_rate": 1.0277777777777777e-06,
"loss": 0.052,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04032721370458603,
"mask/share_reasoning": 0.8429751396179199,
"mask/share_step_conf": 0.11669766902923584,
"num_tokens": 36449318.0,
"reward": 1.3833903074264526,
"reward_std": 0.24456867575645447,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.7504304647445679,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8597375154495239,
"step": 163
},
{
"adv/mean_abs_final_conf": 0.7085994482040405,
"adv/mean_abs_reasoning": 0.5715576410293579,
"adv/mean_abs_step_conf": 0.7559859752655029,
"adv/ratio_final_to_reasoning": 1.2397690054985084,
"adv/ratio_step_to_reasoning": 1.3226767013454588,
"adv/std_final_conf": 0.8809927701950073,
"adv/std_reasoning": 0.8098272085189819,
"adv/std_step_conf": 0.9359097480773926,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7224260055653933,
"calib/avg_num_step_conf": 4.83203125,
"calib/ece": 0.28141269841269845,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.503968253968254,
"calib/gap": 0.34523956488742724,
"calib/mean_conf": 0.5793015873015873,
"calib/mu_c": 0.7628813559322034,
"calib/mu_w": 0.41764179104477617,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19623015873015875,
"calib/std_conf": 0.4523486025748822,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4881397338403042,
"calib/step_q_c_n": 526.0,
"calib/step_q_gap": 0.1514336860203323,
"calib/step_q_w": 0.3367060478199719,
"calib/step_q_w_n": 711.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1971.0,
"completions/max_terminated_length": 1971.0,
"completions/mean_length": 483.765625,
"completions/mean_terminated_length": 485.66278076171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 107.0,
"epoch": 0.17493333333333333,
"grad_norm": 0.05917629599571228,
"kl": 0.139862060546875,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0461,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03503939509391785,
"mask/share_reasoning": 0.8546115159988403,
"mask/share_step_conf": 0.10644285380840302,
"num_tokens": 36679298.0,
"reward": 1.325966238975525,
"reward_std": 0.25556206703186035,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.6949499249458313,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8339599370956421,
"step": 164
},
{
"adv/mean_abs_final_conf": 0.6125674843788147,
"adv/mean_abs_reasoning": 0.5041183233261108,
"adv/mean_abs_step_conf": 0.7621119022369385,
"adv/ratio_final_to_reasoning": 1.2151264019470065,
"adv/ratio_step_to_reasoning": 1.511771873731186,
"adv/std_final_conf": 0.8267259001731873,
"adv/std_reasoning": 0.775274395942688,
"adv/std_step_conf": 0.9353328943252563,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7095137943832734,
"calib/avg_num_step_conf": 4.76953125,
"calib/ece": 0.2916470588235295,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.5333333333333333,
"calib/gap": 0.3677483607571445,
"calib/mean_conf": 0.5851372549019607,
"calib/mu_c": 0.7827118644067795,
"calib/mu_w": 0.414963503649635,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20701960784313736,
"calib/std_conf": 0.4624401188413837,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5051555555555556,
"calib/step_q_c_n": 549.0,
"calib/step_q_gap": 0.14120972222222228,
"calib/step_q_w": 0.3639458333333333,
"calib/step_q_w_n": 672.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1347.0,
"completions/max_terminated_length": 1347.0,
"completions/mean_length": 456.3671875,
"completions/mean_terminated_length": 458.1568908691406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 106.0,
"epoch": 0.176,
"grad_norm": 0.05504642054438591,
"kl": 0.1468353271484375,
"learning_rate": 9.722222222222224e-07,
"loss": -0.0093,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03639739751815796,
"mask/share_reasoning": 0.8458707332611084,
"mask/share_step_conf": 0.11382567137479782,
"num_tokens": 36901704.0,
"reward": 1.3412384986877441,
"reward_std": 0.2222728580236435,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.7026550769805908,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8442078232765198,
"step": 165
},
{
"adv/mean_abs_final_conf": 0.5281888246536255,
"adv/mean_abs_reasoning": 0.47246187925338745,
"adv/mean_abs_step_conf": 0.7369155287742615,
"adv/ratio_final_to_reasoning": 1.1179501412649442,
"adv/ratio_step_to_reasoning": 1.5597354223345585,
"adv/std_final_conf": 0.7823854684829712,
"adv/std_reasoning": 0.7392469048500061,
"adv/std_step_conf": 0.9355576634407043,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.8042750694619853,
"calib/avg_num_step_conf": 4.87890625,
"calib/ece": 0.19105882352941178,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5764705882352941,
"calib/gap": 0.5152008082849204,
"calib/mean_conf": 0.6321960784313726,
"calib/mu_c": 0.8483783783783784,
"calib/mu_w": 0.33317757009345794,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.12143137254901962,
"calib/std_conf": 0.44897539212045834,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5243108728943339,
"calib/step_q_c_n": 653.0,
"calib/step_q_gap": 0.2076749668540654,
"calib/step_q_w": 0.3166359060402685,
"calib/step_q_w_n": 596.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1831.0,
"completions/max_terminated_length": 1831.0,
"completions/mean_length": 464.94140625,
"completions/mean_terminated_length": 464.94140625,
"completions/min_length": 127.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.17706666666666668,
"grad_norm": 0.0417521707713604,
"kl": 0.13323974609375,
"learning_rate": 9.444444444444445e-07,
"loss": 0.0596,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03702709823846817,
"mask/share_reasoning": 0.8455966114997864,
"mask/share_step_conf": 0.11737628281116486,
"num_tokens": 37126913.0,
"reward": 1.4266111850738525,
"reward_std": 0.20052561163902283,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.7997732162475586,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8696931600570679,
"step": 166
},
{
"adv/mean_abs_final_conf": 0.544503927230835,
"adv/mean_abs_reasoning": 0.5453364253044128,
"adv/mean_abs_step_conf": 0.758324921131134,
"adv/ratio_final_to_reasoning": 0.998473422946004,
"adv/ratio_step_to_reasoning": 1.3905634869481323,
"adv/std_final_conf": 0.79471755027771,
"adv/std_reasoning": 0.7754924893379211,
"adv/std_step_conf": 0.9359537363052368,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6856980400026758,
"calib/avg_num_step_conf": 4.7578125,
"calib/ece": 0.270752,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.684,
"calib/gap": 0.32092367382433606,
"calib/mean_conf": 0.7328479999999999,
"calib/mu_c": 0.859933774834437,
"calib/mu_w": 0.5390101010101009,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1998,
"calib/std_conf": 0.4121252053636128,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5010471771428572,
"calib/step_q_c_n": 700.0,
"calib/step_q_gap": 0.0949255555212356,
"calib/step_q_w": 0.4061216216216216,
"calib/step_q_w_n": 518.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2809.0,
"completions/max_terminated_length": 2809.0,
"completions/mean_length": 435.234375,
"completions/mean_terminated_length": 438.6614074707031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 110.0,
"epoch": 0.17813333333333334,
"grad_norm": 0.039866652339696884,
"kl": 0.1396331787109375,
"learning_rate": 9.166666666666666e-07,
"loss": -0.0156,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03890000283718109,
"mask/share_reasoning": 0.8367317318916321,
"mask/share_step_conf": 0.11655577272176743,
"num_tokens": 37343941.0,
"reward": 1.3247613906860352,
"reward_std": 0.2827262878417969,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7072490453720093,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8144960403442383,
"step": 167
},
{
"adv/mean_abs_final_conf": 0.5947244167327881,
"adv/mean_abs_reasoning": 0.5478661060333252,
"adv/mean_abs_step_conf": 0.7788569927215576,
"adv/ratio_final_to_reasoning": 1.0855287636585655,
"adv/ratio_step_to_reasoning": 1.4216192316780807,
"adv/std_final_conf": 0.7944765686988831,
"adv/std_reasoning": 0.7754152417182922,
"adv/std_step_conf": 0.9360412359237671,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7478191911181602,
"calib/avg_num_step_conf": 4.9375,
"calib/ece": 0.23316205533596834,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.6284584980237155,
"calib/gap": 0.3987205921226539,
"calib/mean_conf": 0.6853359683794465,
"calib/mu_c": 0.8382051282051282,
"calib/mu_w": 0.43948453608247423,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.15094861660079048,
"calib/std_conf": 0.43243540696431676,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4778477653631285,
"calib/step_q_c_n": 716.0,
"calib/step_q_gap": 0.15568900623904092,
"calib/step_q_w": 0.3221587591240876,
"calib/step_q_w_n": 548.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2381.0,
"completions/max_terminated_length": 2381.0,
"completions/mean_length": 465.28515625,
"completions/mean_terminated_length": 468.9488220214844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 110.0,
"epoch": 0.1792,
"grad_norm": 0.03753136098384857,
"kl": 0.140655517578125,
"learning_rate": 8.88888888888889e-07,
"loss": -0.0185,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.037122730165719986,
"mask/share_reasoning": 0.8412609696388245,
"mask/share_step_conf": 0.11380381882190704,
"num_tokens": 37567726.0,
"reward": 1.38808012008667,
"reward_std": 0.24945136904716492,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7514784932136536,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8525751233100891,
"step": 168
},
{
"adv/mean_abs_final_conf": 0.6367510557174683,
"adv/mean_abs_reasoning": 0.5308976173400879,
"adv/mean_abs_step_conf": 0.7567160129547119,
"adv/ratio_final_to_reasoning": 1.1993857853567493,
"adv/ratio_step_to_reasoning": 1.4253520608098094,
"adv/std_final_conf": 0.8464077711105347,
"adv/std_reasoning": 0.7754961252212524,
"adv/std_step_conf": 0.9359106421470642,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7465494422386084,
"calib/avg_num_step_conf": 4.28515625,
"calib/ece": 0.24952380952380956,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6031746031746031,
"calib/gap": 0.4017167706560787,
"calib/mean_conf": 0.6587301587301587,
"calib/mu_c": 0.8548062015503877,
"calib/mu_w": 0.453089430894309,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.19817460317460323,
"calib/std_conf": 0.43882867924210656,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5423281690140845,
"calib/step_q_c_n": 568.0,
"calib/step_q_gap": 0.179225522511249,
"calib/step_q_w": 0.3631026465028355,
"calib/step_q_w_n": 529.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2718.0,
"completions/max_terminated_length": 2718.0,
"completions/mean_length": 447.34375,
"completions/mean_terminated_length": 450.86614990234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.18026666666666666,
"grad_norm": 0.04010246694087982,
"kl": 0.15643310546875,
"learning_rate": 8.611111111111112e-07,
"loss": -0.0581,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.037801437079906464,
"mask/share_reasoning": 0.8434059619903564,
"mask/share_step_conf": 0.11098004877567291,
"num_tokens": 37786430.0,
"reward": 1.3459802865982056,
"reward_std": 0.28642362356185913,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.7174351215362549,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8392157554626465,
"step": 169
},
{
"adv/mean_abs_final_conf": 0.6127386093139648,
"adv/mean_abs_reasoning": 0.4825683832168579,
"adv/mean_abs_step_conf": 0.7408894300460815,
"adv/ratio_final_to_reasoning": 1.2697446219525963,
"adv/ratio_step_to_reasoning": 1.5353045408968258,
"adv/std_final_conf": 0.8355098962783813,
"adv/std_reasoning": 0.7394052147865295,
"adv/std_step_conf": 0.9359559416770935,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7182885906040268,
"calib/avg_num_step_conf": 4.86328125,
"calib/ece": 0.23140562248995986,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6626506024096386,
"calib/gap": 0.3948080536912752,
"calib/mean_conf": 0.7026506024096385,
"calib/mu_c": 0.8612080536912752,
"calib/mu_w": 0.4664,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.16783132530120484,
"calib/std_conf": 0.42632006474014184,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5080845481049563,
"calib/step_q_c_n": 686.0,
"calib/step_q_gap": 0.18208097028742498,
"calib/step_q_w": 0.3260035778175313,
"calib/step_q_w_n": 559.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2739.0,
"completions/max_terminated_length": 2739.0,
"completions/mean_length": 485.921875,
"completions/mean_terminated_length": 487.8274841308594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 149.0,
"epoch": 0.18133333333333335,
"grad_norm": 0.04000094532966614,
"kl": 0.127960205078125,
"learning_rate": 8.333333333333333e-07,
"loss": -0.0138,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03573903813958168,
"mask/share_reasoning": 0.8482216000556946,
"mask/share_step_conf": 0.11213310807943344,
"num_tokens": 38014978.0,
"reward": 1.3481841087341309,
"reward_std": 0.29889148473739624,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7322218418121338,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8269951343536377,
"step": 170
},
{
"adv/mean_abs_final_conf": 0.675028383731842,
"adv/mean_abs_reasoning": 0.5765197277069092,
"adv/mean_abs_step_conf": 0.7273029685020447,
"adv/ratio_final_to_reasoning": 1.1708677973896022,
"adv/ratio_step_to_reasoning": 1.2615404704273894,
"adv/std_final_conf": 0.8604573011398315,
"adv/std_reasoning": 0.8098583817481995,
"adv/std_step_conf": 0.9359089732170105,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6691556395715186,
"calib/avg_num_step_conf": 4.59375,
"calib/ece": 0.3261739130434782,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5612648221343873,
"calib/gap": 0.282,
"calib/mean_conf": 0.6241818181818182,
"calib/mu_c": 0.778,
"calib/mu_w": 0.49600000000000005,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2479051383399209,
"calib/std_conf": 0.4454292224793872,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5166305220883535,
"calib/step_q_c_n": 498.0,
"calib/step_q_gap": 0.16900677577566914,
"calib/step_q_w": 0.34762374631268433,
"calib/step_q_w_n": 678.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1813.0,
"completions/max_terminated_length": 1813.0,
"completions/mean_length": 458.9140625,
"completions/mean_terminated_length": 458.9140625,
"completions/min_length": 100.0,
"completions/min_terminated_length": 100.0,
"epoch": 0.1824,
"grad_norm": 0.04507390782237053,
"kl": 0.14056396484375,
"learning_rate": 8.055555555555557e-07,
"loss": -0.0121,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03748851642012596,
"mask/share_reasoning": 0.8515720963478088,
"mask/share_step_conf": 0.1109393835067749,
"num_tokens": 38239356.0,
"reward": 1.3258652687072754,
"reward_std": 0.2702135443687439,
"rewards/accuracy_reward_step": 0.453125,
"rewards/final_brier_reward_step": 0.6569274663925171,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8532608151435852,
"step": 171
},
{
"adv/mean_abs_final_conf": 0.6522042751312256,
"adv/mean_abs_reasoning": 0.5203927755355835,
"adv/mean_abs_step_conf": 0.7619109153747559,
"adv/ratio_final_to_reasoning": 1.2532923318544975,
"adv/ratio_step_to_reasoning": 1.464107403471549,
"adv/std_final_conf": 0.8345689177513123,
"adv/std_reasoning": 0.7753735780715942,
"adv/std_step_conf": 0.9356265068054199,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7008541600759253,
"calib/avg_num_step_conf": 4.453125,
"calib/ece": 0.26681102362204734,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.6850393700787402,
"calib/gap": 0.31271306548560585,
"calib/mean_conf": 0.7587007874015748,
"calib/mu_c": 0.892896551724138,
"calib/mu_w": 0.5801834862385321,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.22732283464566938,
"calib/std_conf": 0.3834585219171378,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5435008665511265,
"calib/step_q_c_n": 577.0,
"calib/step_q_gap": 0.1564386995884267,
"calib/step_q_w": 0.3870621669626998,
"calib/step_q_w_n": 563.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1862.0,
"completions/max_terminated_length": 1862.0,
"completions/mean_length": 431.22265625,
"completions/mean_terminated_length": 432.91375732421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.18346666666666667,
"grad_norm": 0.0644737184047699,
"kl": 0.1463623046875,
"learning_rate": 7.777777777777779e-07,
"loss": 0.0133,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03934334218502045,
"mask/share_reasoning": 0.8426755666732788,
"mask/share_step_conf": 0.11407487094402313,
"num_tokens": 38453101.0,
"reward": 1.3807896375656128,
"reward_std": 0.2652018070220947,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.7202441692352295,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8648082613945007,
"step": 172
},
{
"adv/mean_abs_final_conf": 0.6737043857574463,
"adv/mean_abs_reasoning": 0.5796314477920532,
"adv/mean_abs_step_conf": 0.7727189660072327,
"adv/ratio_final_to_reasoning": 1.1622978503387593,
"adv/ratio_step_to_reasoning": 1.3331211909752194,
"adv/std_final_conf": 0.8583126664161682,
"adv/std_reasoning": 0.792940616607666,
"adv/std_step_conf": 0.9362257122993469,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6987571762033941,
"calib/avg_num_step_conf": 4.921875,
"calib/ece": 0.27857142857142864,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6865079365079365,
"calib/gap": 0.34207936407797623,
"calib/mean_conf": 0.7465873015873016,
"calib/mu_c": 0.9108396946564886,
"calib/mu_w": 0.5687603305785124,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.25265873015873025,
"calib/std_conf": 0.3980610998374869,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5129598755832038,
"calib/step_q_c_n": 643.0,
"calib/step_q_gap": 0.09447132885170734,
"calib/step_q_w": 0.41848854673149644,
"calib/step_q_w_n": 617.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2279.0,
"completions/max_terminated_length": 2279.0,
"completions/mean_length": 484.59765625,
"completions/mean_terminated_length": 484.59765625,
"completions/min_length": 88.0,
"completions/min_terminated_length": 88.0,
"epoch": 0.18453333333333333,
"grad_norm": 0.046825289726257324,
"kl": 0.135772705078125,
"learning_rate": 7.5e-07,
"loss": 0.06,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03792772442102432,
"mask/share_reasoning": 0.8422991037368774,
"mask/share_step_conf": 0.11977314949035645,
"num_tokens": 38680318.0,
"reward": 1.3218117952346802,
"reward_std": 0.29576268792152405,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.7001835703849792,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8221105337142944,
"step": 173
},
{
"adv/mean_abs_final_conf": 0.7295100092887878,
"adv/mean_abs_reasoning": 0.5894637107849121,
"adv/mean_abs_step_conf": 0.7436856031417847,
"adv/ratio_final_to_reasoning": 1.237582561812659,
"adv/ratio_step_to_reasoning": 1.2616308511197667,
"adv/std_final_conf": 0.907024621963501,
"adv/std_reasoning": 0.8264725208282471,
"adv/std_step_conf": 0.9360268115997314,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6277687398323113,
"calib/avg_num_step_conf": 4.7890625,
"calib/ece": 0.3521739130434783,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5968379446640316,
"calib/gap": 0.22327430859717168,
"calib/mean_conf": 0.6431620553359684,
"calib/mu_c": 0.7587704918032786,
"calib/mu_w": 0.5354961832061069,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.25656126482213437,
"calib/std_conf": 0.4448576196956833,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4367843137254902,
"calib/step_q_c_n": 561.0,
"calib/step_q_gap": 0.053985316231806024,
"calib/step_q_w": 0.3827989974936842,
"calib/step_q_w_n": 665.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2366.0,
"completions/max_terminated_length": 2366.0,
"completions/mean_length": 460.16015625,
"completions/mean_terminated_length": 461.9647216796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.1856,
"grad_norm": 0.04496309533715248,
"kl": 0.1407012939453125,
"learning_rate": 7.222222222222222e-07,
"loss": 0.0457,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.035832107067108154,
"mask/share_reasoning": 0.8494890332221985,
"mask/share_step_conf": 0.11077260971069336,
"num_tokens": 38902351.0,
"reward": 1.2767741680145264,
"reward_std": 0.28168269991874695,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.6305328011512756,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8150234222412109,
"step": 174
},
{
"adv/mean_abs_final_conf": 0.6855437755584717,
"adv/mean_abs_reasoning": 0.4921633005142212,
"adv/mean_abs_step_conf": 0.7646262049674988,
"adv/ratio_final_to_reasoning": 1.3929193315352912,
"adv/ratio_step_to_reasoning": 1.5536026440179578,
"adv/std_final_conf": 0.8800501823425293,
"adv/std_reasoning": 0.7576267719268799,
"adv/std_step_conf": 0.9361082315444946,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7554347826086957,
"calib/avg_num_step_conf": 5.03125,
"calib/ece": 0.2831075697211155,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.4860557768924303,
"calib/gap": 0.430406070549631,
"calib/mean_conf": 0.5675697211155378,
"calib/mu_c": 0.8402173913043479,
"calib/mu_w": 0.40981132075471693,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.24207171314741033,
"calib/std_conf": 0.45303499565746624,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5376992665036674,
"calib/step_q_c_n": 409.0,
"calib/step_q_gap": 0.23311519369365602,
"calib/step_q_w": 0.3045840728100114,
"calib/step_q_w_n": 879.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2968.0,
"completions/max_terminated_length": 2968.0,
"completions/mean_length": 467.69921875,
"completions/mean_terminated_length": 469.5333557128906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.18666666666666668,
"grad_norm": 0.14519743621349335,
"kl": 0.2266845703125,
"learning_rate": 6.944444444444446e-07,
"loss": -0.0928,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03772320598363876,
"mask/share_reasoning": 0.8381674289703369,
"mask/share_step_conf": 0.12020306289196014,
"num_tokens": 39127906.0,
"reward": 1.3145315647125244,
"reward_std": 0.2927352488040924,
"rewards/accuracy_reward_step": 0.359375,
"rewards/final_brier_reward_step": 0.7042621374130249,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8288068175315857,
"step": 175
},
{
"adv/mean_abs_final_conf": 0.619603157043457,
"adv/mean_abs_reasoning": 0.39660730957984924,
"adv/mean_abs_step_conf": 0.7484374642372131,
"adv/ratio_final_to_reasoning": 1.56225854157817,
"adv/ratio_step_to_reasoning": 1.8870995217664532,
"adv/std_final_conf": 0.8269156813621521,
"adv/std_reasoning": 0.6816341876983643,
"adv/std_step_conf": 0.9356382489204407,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7637772675086107,
"calib/avg_num_step_conf": 4.60546875,
"calib/ece": 0.2599601593625498,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5856573705179283,
"calib/gap": 0.4572847301951782,
"calib/mean_conf": 0.6374103585657371,
"calib/mu_c": 0.8815384615384617,
"calib/mu_w": 0.42425373134328354,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21561752988047805,
"calib/std_conf": 0.45065166065959406,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.561765784114053,
"calib/step_q_c_n": 491.0,
"calib/step_q_gap": 0.24466840039312276,
"calib/step_q_w": 0.3170973837209302,
"calib/step_q_w_n": 688.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2726.0,
"completions/max_terminated_length": 2726.0,
"completions/mean_length": 465.328125,
"completions/mean_terminated_length": 467.1529541015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 125.0,
"epoch": 0.18773333333333334,
"grad_norm": 0.04847950115799904,
"kl": 0.132049560546875,
"learning_rate": 6.666666666666667e-07,
"loss": -0.0013,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.0396907776594162,
"mask/share_reasoning": 0.8393638134002686,
"mask/share_step_conf": 0.11703912168741226,
"num_tokens": 39351094.0,
"reward": 1.3537293672561646,
"reward_std": 0.27692878246307373,
"rewards/accuracy_reward_step": 0.45703125,
"rewards/final_brier_reward_step": 0.7317417860031128,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8441084623336792,
"step": 176
},
{
"adv/mean_abs_final_conf": 0.6589510440826416,
"adv/mean_abs_reasoning": 0.3878467082977295,
"adv/mean_abs_step_conf": 0.7780240178108215,
"adv/ratio_final_to_reasoning": 1.6989986764997875,
"adv/ratio_step_to_reasoning": 2.0060090782401936,
"adv/std_final_conf": 0.8572041988372803,
"adv/std_reasoning": 0.6612958908081055,
"adv/std_step_conf": 0.935721755027771,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7402791625124627,
"calib/avg_num_step_conf": 4.90625,
"calib/ece": 0.25370078740157476,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5354330708661418,
"calib/gap": 0.40251246261216345,
"calib/mean_conf": 0.5934645669291339,
"calib/mu_c": 0.8089830508474576,
"calib/mu_w": 0.40647058823529414,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19129921259842517,
"calib/std_conf": 0.45309066022408007,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5174404761904763,
"calib/step_q_c_n": 504.0,
"calib/step_q_gap": 0.21048036980749757,
"calib/step_q_w": 0.3069601063829787,
"calib/step_q_w_n": 752.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2066.0,
"completions/max_terminated_length": 2066.0,
"completions/mean_length": 464.4140625,
"completions/mean_terminated_length": 464.4140625,
"completions/min_length": 107.0,
"completions/min_terminated_length": 107.0,
"epoch": 0.1888,
"grad_norm": 0.09088198095560074,
"kl": 0.1372833251953125,
"learning_rate": 6.388888888888889e-07,
"loss": -0.0281,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.0373879075050354,
"mask/share_reasoning": 0.8458495140075684,
"mask/share_step_conf": 0.11676257103681564,
"num_tokens": 39573816.0,
"reward": 1.3609651327133179,
"reward_std": 0.24577787518501282,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.7238953113555908,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8537049293518066,
"step": 177
},
{
"adv/mean_abs_final_conf": 0.6388211250305176,
"adv/mean_abs_reasoning": 0.6201913952827454,
"adv/mean_abs_step_conf": 0.7540395259857178,
"adv/ratio_final_to_reasoning": 1.0300386782039743,
"adv/ratio_step_to_reasoning": 1.2158174584830397,
"adv/std_final_conf": 0.8600205779075623,
"adv/std_reasoning": 0.8266543745994568,
"adv/std_step_conf": 0.9360244274139404,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.8161231884057971,
"calib/avg_num_step_conf": 4.8359375,
"calib/ece": 0.180728,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.556,
"calib/gap": 0.5187885610766045,
"calib/mean_conf": 0.6250319999999999,
"calib/mu_c": 0.8574492753623189,
"calib/mu_w": 0.33866071428571426,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.12688,
"calib/std_conf": 0.44586524306790276,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4992134831460674,
"calib/step_q_c_n": 623.0,
"calib/step_q_gap": 0.19817445875582346,
"calib/step_q_w": 0.30103902439024394,
"calib/step_q_w_n": 615.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2657.0,
"completions/max_terminated_length": 2657.0,
"completions/mean_length": 444.16015625,
"completions/mean_terminated_length": 449.4269104003906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.18986666666666666,
"grad_norm": 0.054977044463157654,
"kl": 0.1357879638671875,
"learning_rate": 6.111111111111112e-07,
"loss": -0.0295,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.037683069705963135,
"mask/share_reasoning": 0.8318748474121094,
"mask/share_step_conf": 0.11872333288192749,
"num_tokens": 39793593.0,
"reward": 1.3868026733398438,
"reward_std": 0.2786443829536438,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.7862921953201294,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8420940637588501,
"step": 178
},
{
"adv/mean_abs_final_conf": 0.6645340919494629,
"adv/mean_abs_reasoning": 0.5819056630134583,
"adv/mean_abs_step_conf": 0.7427074909210205,
"adv/ratio_final_to_reasoning": 1.1419962619165878,
"adv/ratio_step_to_reasoning": 1.2763365922146788,
"adv/std_final_conf": 0.8761852979660034,
"adv/std_reasoning": 0.8265078663825989,
"adv/std_step_conf": 0.9360561966896057,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7403410852713178,
"calib/avg_num_step_conf": 4.1875,
"calib/ece": 0.25804724409448815,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5393700787401575,
"calib/gap": 0.4028840930232559,
"calib/mean_conf": 0.5851023622047243,
"calib/mu_c": 0.7833720930232558,
"calib/mu_w": 0.38048799999999994,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.16763779527559053,
"calib/std_conf": 0.45991964951518216,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5248474903474903,
"calib/step_q_c_n": 518.0,
"calib/step_q_gap": 0.17961283330777916,
"calib/step_q_w": 0.3452346570397112,
"calib/step_q_w_n": 554.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1245.0,
"completions/max_terminated_length": 1245.0,
"completions/mean_length": 421.703125,
"completions/mean_terminated_length": 425.02362060546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.19093333333333334,
"grad_norm": 0.04853654280304909,
"kl": 0.149322509765625,
"learning_rate": 5.833333333333334e-07,
"loss": -0.0655,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03802802786231041,
"mask/share_reasoning": 0.8477430939674377,
"mask/share_step_conf": 0.10641638189554214,
"num_tokens": 40007813.0,
"reward": 1.3584095239639282,
"reward_std": 0.2803743779659271,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.7246987819671631,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8468413352966309,
"step": 179
},
{
"adv/mean_abs_final_conf": 0.6793646812438965,
"adv/mean_abs_reasoning": 0.5431982278823853,
"adv/mean_abs_step_conf": 0.7438191771507263,
"adv/ratio_final_to_reasoning": 1.250675437385621,
"adv/ratio_step_to_reasoning": 1.3693328493549137,
"adv/std_final_conf": 0.8664782643318176,
"adv/std_reasoning": 0.7928855419158936,
"adv/std_step_conf": 0.9362828135490417,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7301007882155317,
"calib/avg_num_step_conf": 4.73828125,
"calib/ece": 0.2398446215139441,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.549800796812749,
"calib/gap": 0.37958831890425127,
"calib/mean_conf": 0.6286095617529881,
"calib/mu_c": 0.7934507042253521,
"calib/mu_w": 0.41386238532110087,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1513585657370517,
"calib/std_conf": 0.4365410753940372,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.46762560777957857,
"calib/step_q_c_n": 617.0,
"calib/step_q_gap": 0.13475501954428443,
"calib/step_q_w": 0.33287058823529414,
"calib/step_q_w_n": 595.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3047.0,
"completions/max_terminated_length": 3047.0,
"completions/mean_length": 493.0625,
"completions/mean_terminated_length": 494.99609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 119.0,
"epoch": 0.192,
"grad_norm": 0.04936710745096207,
"kl": 0.1364593505859375,
"learning_rate": 5.555555555555555e-07,
"loss": -0.0353,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.035570383071899414,
"mask/share_reasoning": 0.8538831472396851,
"mask/share_step_conf": 0.10664021968841553,
"num_tokens": 40237893.0,
"reward": 1.330967903137207,
"reward_std": 0.3095991909503937,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.7201747894287109,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8185367584228516,
"step": 180
},
{
"adv/mean_abs_final_conf": 0.6676289439201355,
"adv/mean_abs_reasoning": 0.5264783501625061,
"adv/mean_abs_step_conf": 0.7492181062698364,
"adv/ratio_final_to_reasoning": 1.2681033203246836,
"adv/ratio_step_to_reasoning": 1.4230748634555972,
"adv/std_final_conf": 0.8744766116142273,
"adv/std_reasoning": 0.7928068041801453,
"adv/std_step_conf": 0.9359700083732605,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7649448818897637,
"calib/avg_num_step_conf": 4.53515625,
"calib/ece": 0.23920238095238094,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.4444444444444444,
"calib/gap": 0.3926706771653545,
"calib/mean_conf": 0.5361944444444445,
"calib/mu_c": 0.7340880000000001,
"calib/mu_w": 0.3414173228346456,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.13968253968253969,
"calib/std_conf": 0.44734548397477747,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4891968085106383,
"calib/step_q_c_n": 564.0,
"calib/step_q_gap": 0.17155861755586443,
"calib/step_q_w": 0.31763819095477386,
"calib/step_q_w_n": 597.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3010.0,
"completions/max_terminated_length": 3010.0,
"completions/mean_length": 441.85546875,
"completions/mean_terminated_length": 443.5882568359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.19306666666666666,
"grad_norm": 0.047045283019542694,
"kl": 0.1700897216796875,
"learning_rate": 5.277777777777779e-07,
"loss": 0.051,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.037868089973926544,
"mask/share_reasoning": 0.8394981622695923,
"mask/share_step_conf": 0.11872752755880356,
"num_tokens": 40457272.0,
"reward": 1.3582584857940674,
"reward_std": 0.2621048092842102,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.7291232347488403,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8468218445777893,
"step": 181
},
{
"adv/mean_abs_final_conf": 0.565575897693634,
"adv/mean_abs_reasoning": 0.3998072147369385,
"adv/mean_abs_step_conf": 0.7626135945320129,
"adv/ratio_final_to_reasoning": 1.4146215396982431,
"adv/ratio_step_to_reasoning": 1.907453308549698,
"adv/std_final_conf": 0.799468994140625,
"adv/std_reasoning": 0.68152916431427,
"adv/std_step_conf": 0.9355183243751526,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7179000835851604,
"calib/avg_num_step_conf": 4.6953125,
"calib/ece": 0.25555511811023623,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.610236220472441,
"calib/gap": 0.3693243104224266,
"calib/mean_conf": 0.6671220472440945,
"calib/mu_c": 0.8168874172185431,
"calib/mu_w": 0.4475631067961165,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16409448818897643,
"calib/std_conf": 0.4407465803572345,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4907097744360902,
"calib/step_q_c_n": 665.0,
"calib/step_q_gap": 0.1647898489239859,
"calib/step_q_w": 0.3259199255121043,
"calib/step_q_w_n": 537.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2494.0,
"completions/max_terminated_length": 2494.0,
"completions/mean_length": 456.72265625,
"completions/mean_terminated_length": 456.72265625,
"completions/min_length": 135.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.19413333333333332,
"grad_norm": 0.046485915780067444,
"kl": 0.14080810546875,
"learning_rate": 5.000000000000001e-07,
"loss": 0.0515,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03678284212946892,
"mask/share_reasoning": 0.8472731113433838,
"mask/share_step_conf": 0.11594408005475998,
"num_tokens": 40680353.0,
"reward": 1.3809163570404053,
"reward_std": 0.21928508579730988,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7317008376121521,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8568628430366516,
"step": 182
},
{
"adv/mean_abs_final_conf": 0.6731914281845093,
"adv/mean_abs_reasoning": 0.6058062314987183,
"adv/mean_abs_step_conf": 0.758799135684967,
"adv/ratio_final_to_reasoning": 1.1112322607165748,
"adv/ratio_step_to_reasoning": 1.2525442892981737,
"adv/std_final_conf": 0.8595358729362488,
"adv/std_reasoning": 0.8099452257156372,
"adv/std_step_conf": 0.9361091256141663,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.637114951164538,
"calib/avg_num_step_conf": 4.2109375,
"calib/ece": 0.3296047430830039,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5177865612648221,
"calib/gap": 0.25068870523415987,
"calib/mean_conf": 0.5725296442687747,
"calib/mu_c": 0.6924242424242425,
"calib/mu_w": 0.4417355371900826,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19019762845849808,
"calib/std_conf": 0.46056238618867934,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4695128676470588,
"calib/step_q_c_n": 544.0,
"calib/step_q_gap": 0.10510462794668429,
"calib/step_q_w": 0.3644082397003745,
"calib/step_q_w_n": 534.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2079.0,
"completions/max_terminated_length": 2079.0,
"completions/mean_length": 459.60546875,
"completions/mean_terminated_length": 459.60546875,
"completions/min_length": 132.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.1952,
"grad_norm": 0.06715097278356552,
"kl": 0.1397247314453125,
"learning_rate": 4.7222222222222226e-07,
"loss": 0.0486,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.037804316729307175,
"mask/share_reasoning": 0.859054446220398,
"mask/share_step_conf": 0.10314127802848816,
"num_tokens": 40904692.0,
"reward": 1.300488829612732,
"reward_std": 0.2849113345146179,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.6523257493972778,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8243258595466614,
"step": 183
},
{
"adv/mean_abs_final_conf": 0.70212721824646,
"adv/mean_abs_reasoning": 0.5660622119903564,
"adv/mean_abs_step_conf": 0.7311906218528748,
"adv/ratio_final_to_reasoning": 1.240371117120995,
"adv/ratio_step_to_reasoning": 1.2917142433548126,
"adv/std_final_conf": 0.8754715919494629,
"adv/std_reasoning": 0.8265897631645203,
"adv/std_step_conf": 0.9362109899520874,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.737949829261886,
"calib/avg_num_step_conf": 4.71875,
"calib/ece": 0.23835341365461854,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.6184738955823293,
"calib/gap": 0.4285973207249802,
"calib/mean_conf": 0.6523293172690763,
"calib/mu_c": 0.8382269503546099,
"calib/mu_w": 0.40962962962962973,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16220883534136554,
"calib/std_conf": 0.44985606353090074,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5141587561374794,
"calib/step_q_c_n": 611.0,
"calib/step_q_gap": 0.18701972766176755,
"calib/step_q_w": 0.32713902847571186,
"calib/step_q_w_n": 597.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2596.0,
"completions/max_terminated_length": 2596.0,
"completions/mean_length": 445.72265625,
"completions/mean_terminated_length": 449.2322692871094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.19626666666666667,
"grad_norm": 0.03923555091023445,
"kl": 0.149627685546875,
"learning_rate": 4.444444444444445e-07,
"loss": -0.0426,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03696506842970848,
"mask/share_reasoning": 0.8455117344856262,
"mask/share_step_conf": 0.1097106784582138,
"num_tokens": 41124077.0,
"reward": 1.3372976779937744,
"reward_std": 0.3383023142814636,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7344995737075806,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8177042007446289,
"step": 184
},
{
"adv/mean_abs_final_conf": 0.6509227752685547,
"adv/mean_abs_reasoning": 0.5465054512023926,
"adv/mean_abs_step_conf": 0.7464802265167236,
"adv/ratio_final_to_reasoning": 1.1910636460010209,
"adv/ratio_step_to_reasoning": 1.3659154265970395,
"adv/std_final_conf": 0.8595633506774902,
"adv/std_reasoning": 0.7754468321800232,
"adv/std_step_conf": 0.9359642267227173,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7657847069478587,
"calib/avg_num_step_conf": 4.67578125,
"calib/ece": 0.231004016064257,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5502008032128514,
"calib/gap": 0.43655065338336146,
"calib/mean_conf": 0.6228915662650603,
"calib/mu_c": 0.8297709923664123,
"calib/mu_w": 0.3932203389830508,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.16389558232931722,
"calib/std_conf": 0.4465374059457861,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5231329690346084,
"calib/step_q_c_n": 549.0,
"calib/step_q_gap": 0.233674635701275,
"calib/step_q_w": 0.2894583333333334,
"calib/step_q_w_n": 648.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 3024.0,
"completions/max_terminated_length": 3024.0,
"completions/mean_length": 467.9296875,
"completions/mean_terminated_length": 473.478271484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.19733333333333333,
"grad_norm": 0.05877178534865379,
"kl": 0.13330078125,
"learning_rate": 4.1666666666666667e-07,
"loss": 0.0617,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03615127131342888,
"mask/share_reasoning": 0.8481602668762207,
"mask/share_step_conf": 0.10396970063447952,
"num_tokens": 41350787.0,
"reward": 1.354670763015747,
"reward_std": 0.26534274220466614,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.7350628972053528,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.839092493057251,
"step": 185
},
{
"adv/mean_abs_final_conf": 0.5814087986946106,
"adv/mean_abs_reasoning": 0.5024453997612,
"adv/mean_abs_step_conf": 0.7515132427215576,
"adv/ratio_final_to_reasoning": 1.1571581687700594,
"adv/ratio_step_to_reasoning": 1.4957112615196269,
"adv/std_final_conf": 0.8138114213943481,
"adv/std_reasoning": 0.7575085759162903,
"adv/std_step_conf": 0.9355971217155457,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7794045275590551,
"calib/avg_num_step_conf": 4.58984375,
"calib/ece": 0.22841568627450987,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.5176470588235295,
"calib/gap": 0.4784271653543307,
"calib/mean_conf": 0.5925254901960784,
"calib/mu_c": 0.8326771653543307,
"calib/mu_w": 0.35424999999999995,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1614509803921569,
"calib/std_conf": 0.46045974290967717,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5064904580152672,
"calib/step_q_c_n": 524.0,
"calib/step_q_gap": 0.18329998182479096,
"calib/step_q_w": 0.3231904761904762,
"calib/step_q_w_n": 651.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2256.0,
"completions/max_terminated_length": 2256.0,
"completions/mean_length": 461.1015625,
"completions/mean_terminated_length": 461.1015625,
"completions/min_length": 125.0,
"completions/min_terminated_length": 125.0,
"epoch": 0.1984,
"grad_norm": 0.06877963989973068,
"kl": 0.150177001953125,
"learning_rate": 3.8888888888888895e-07,
"loss": 0.0762,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03738259896636009,
"mask/share_reasoning": 0.8527088165283203,
"mask/share_step_conf": 0.1099085807800293,
"num_tokens": 41573869.0,
"reward": 1.4036998748779297,
"reward_std": 0.21290750801563263,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.7652618885040283,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8718501925468445,
"step": 186
},
{
"adv/mean_abs_final_conf": 0.700536847114563,
"adv/mean_abs_reasoning": 0.603737473487854,
"adv/mean_abs_step_conf": 0.7438031435012817,
"adv/ratio_final_to_reasoning": 1.1603335520445484,
"adv/ratio_step_to_reasoning": 1.2319976416309788,
"adv/std_final_conf": 0.8750166893005371,
"adv/std_reasoning": 0.8099066019058228,
"adv/std_step_conf": 0.9360366463661194,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6577019471756314,
"calib/avg_num_step_conf": 5.4609375,
"calib/ece": 0.307968,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.5,
"calib/gap": 0.2565840241629714,
"calib/mean_conf": 0.594352,
"calib/mu_c": 0.7308547008547007,
"calib/mu_w": 0.47427067669172934,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.21716000000000002,
"calib/std_conf": 0.43475091270289473,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4549644864847921,
"calib/step_q_c_n": 574.0,
"calib/step_q_gap": 0.12196934085372407,
"calib/step_q_w": 0.332995145631068,
"calib/step_q_w_n": 824.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3045.0,
"completions/max_terminated_length": 3045.0,
"completions/mean_length": 503.4765625,
"completions/mean_terminated_length": 503.4765625,
"completions/min_length": 81.0,
"completions/min_terminated_length": 81.0,
"epoch": 0.19946666666666665,
"grad_norm": 0.11839123070240021,
"kl": 0.1331329345703125,
"learning_rate": 3.611111111111111e-07,
"loss": 0.0877,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03556034713983536,
"mask/share_reasoning": 0.8487687706947327,
"mask/share_step_conf": 0.11567091196775436,
"num_tokens": 41804303.0,
"reward": 1.2941031455993652,
"reward_std": 0.2728716731071472,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.6580247282981873,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.821340799331665,
"step": 187
},
{
"adv/mean_abs_final_conf": 0.7011606693267822,
"adv/mean_abs_reasoning": 0.5370633602142334,
"adv/mean_abs_step_conf": 0.7369471192359924,
"adv/ratio_final_to_reasoning": 1.3055455301346395,
"adv/ratio_step_to_reasoning": 1.3721791018140315,
"adv/std_final_conf": 0.9064499139785767,
"adv/std_reasoning": 0.7926884293556213,
"adv/std_step_conf": 0.9360687732696533,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.760989010989011,
"calib/avg_num_step_conf": 4.55859375,
"calib/ece": 0.24700000000000008,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.56640625,
"calib/gap": 0.44895677655677657,
"calib/mean_conf": 0.6251093750000001,
"calib/mu_c": 0.8530952380952381,
"calib/mu_w": 0.40413846153846156,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18996093750000007,
"calib/std_conf": 0.4533882034604665,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5160293577981653,
"calib/step_q_c_n": 545.0,
"calib/step_q_gap": 0.15879784654414603,
"calib/step_q_w": 0.35723151125401925,
"calib/step_q_w_n": 622.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1397.0,
"completions/max_terminated_length": 1397.0,
"completions/mean_length": 437.6328125,
"completions/mean_terminated_length": 439.34906005859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.20053333333333334,
"grad_norm": 0.06301950663328171,
"kl": 0.136566162109375,
"learning_rate": 3.3333333333333335e-07,
"loss": 0.0086,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03884129226207733,
"mask/share_reasoning": 0.8422836065292358,
"mask/share_step_conf": 0.11496884375810623,
"num_tokens": 42020409.0,
"reward": 1.3785154819488525,
"reward_std": 0.2768915891647339,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.7512555122375488,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.8536690473556519,
"step": 188
},
{
"adv/mean_abs_final_conf": 0.6124563217163086,
"adv/mean_abs_reasoning": 0.45022422075271606,
"adv/mean_abs_step_conf": 0.7598081827163696,
"adv/ratio_final_to_reasoning": 1.3603362357812774,
"adv/ratio_step_to_reasoning": 1.6876217397768376,
"adv/std_final_conf": 0.8141918778419495,
"adv/std_reasoning": 0.7205761075019836,
"adv/std_step_conf": 0.9360559582710266,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7318624712964688,
"calib/avg_num_step_conf": 4.2265625,
"calib/ece": 0.24921259842519683,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.44881889763779526,
"calib/gap": 0.40485074163718743,
"calib/mean_conf": 0.5106299212598425,
"calib/mu_c": 0.7194308943089431,
"calib/mu_w": 0.3145801526717557,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.13779527559055116,
"calib/std_conf": 0.4628859284173213,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5091338582677166,
"calib/step_q_c_n": 508.0,
"calib/step_q_gap": 0.22357636697851796,
"calib/step_q_w": 0.28555749128919866,
"calib/step_q_w_n": 574.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1712.0,
"completions/max_terminated_length": 1712.0,
"completions/mean_length": 424.0234375,
"completions/mean_terminated_length": 427.3622131347656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 111.0,
"epoch": 0.2016,
"grad_norm": 0.04162989556789398,
"kl": 0.1527099609375,
"learning_rate": 3.055555555555556e-07,
"loss": -0.0255,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03960593044757843,
"mask/share_reasoning": 0.8452208042144775,
"mask/share_step_conf": 0.10736077278852463,
"num_tokens": 42236727.0,
"reward": 1.3523257970809937,
"reward_std": 0.2419973611831665,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.731751561164856,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8391844034194946,
"step": 189
},
{
"adv/mean_abs_final_conf": 0.7057432532310486,
"adv/mean_abs_reasoning": 0.6029276251792908,
"adv/mean_abs_step_conf": 0.7524210214614868,
"adv/ratio_final_to_reasoning": 1.1705273133258471,
"adv/ratio_step_to_reasoning": 1.2479458396648215,
"adv/std_final_conf": 0.8848484754562378,
"adv/std_reasoning": 0.8428612947463989,
"adv/std_step_conf": 0.9360077381134033,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7822839856305539,
"calib/avg_num_step_conf": 4.703125,
"calib/ece": 0.19853174603174595,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.4801587301587302,
"calib/gap": 0.499896010588013,
"calib/mean_conf": 0.5331349206349206,
"calib/mu_c": 0.7771317829457365,
"calib/mu_w": 0.2772357723577235,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1098809523809523,
"calib/std_conf": 0.4639368432177326,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.49771258503401355,
"calib/step_q_c_n": 588.0,
"calib/step_q_gap": 0.1850534941249226,
"calib/step_q_w": 0.31265909090909094,
"calib/step_q_w_n": 616.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2997.0,
"completions/max_terminated_length": 2997.0,
"completions/mean_length": 486.234375,
"completions/mean_terminated_length": 486.234375,
"completions/min_length": 156.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.20266666666666666,
"grad_norm": 0.06818609684705734,
"kl": 0.1410675048828125,
"learning_rate": 2.7777777777777776e-07,
"loss": 0.033,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.034055113792419434,
"mask/share_reasoning": 0.8611304759979248,
"mask/share_step_conf": 0.10481436550617218,
"num_tokens": 42466811.0,
"reward": 1.3780144453048706,
"reward_std": 0.2664412260055542,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.7680996656417847,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8447458744049072,
"step": 190
},
{
"adv/mean_abs_final_conf": 0.584932267665863,
"adv/mean_abs_reasoning": 0.46373453736305237,
"adv/mean_abs_step_conf": 0.7547152638435364,
"adv/ratio_final_to_reasoning": 1.2613515288121109,
"adv/ratio_step_to_reasoning": 1.6274726228826872,
"adv/std_final_conf": 0.8164354562759399,
"adv/std_reasoning": 0.7392594814300537,
"adv/std_step_conf": 0.9352312684059143,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7241357069143446,
"calib/avg_num_step_conf": 5.58203125,
"calib/ece": 0.2514,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.504,
"calib/gap": 0.3930263157894736,
"calib/mean_conf": 0.57172,
"calib/mu_c": 0.7855263157894736,
"calib/mu_w": 0.3925,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18356000000000003,
"calib/std_conf": 0.4546654172025843,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.49432965517241373,
"calib/step_q_c_n": 551.0,
"calib/step_q_gap": 0.1917289718011153,
"calib/step_q_w": 0.3026006833712984,
"calib/step_q_w_n": 878.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2809.0,
"completions/max_terminated_length": 2809.0,
"completions/mean_length": 453.77734375,
"completions/mean_terminated_length": 457.35040283203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 87.0,
"epoch": 0.20373333333333332,
"grad_norm": 0.06090309098362923,
"kl": 0.1447601318359375,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.0163,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.040286559611558914,
"mask/share_reasoning": 0.821499228477478,
"mask/share_step_conf": 0.13040170073509216,
"num_tokens": 42687146.0,
"reward": 1.3344109058380127,
"reward_std": 0.23764482140541077,
"rewards/accuracy_reward_step": 0.4453125,
"rewards/final_brier_reward_step": 0.7097808122634888,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8373328447341919,
"step": 191
},
{
"adv/mean_abs_final_conf": 0.6598813533782959,
"adv/mean_abs_reasoning": 0.465440958738327,
"adv/mean_abs_step_conf": 0.7431613206863403,
"adv/ratio_final_to_reasoning": 1.4177552297224536,
"adv/ratio_step_to_reasoning": 1.596682257403455,
"adv/std_final_conf": 0.8603542447090149,
"adv/std_reasoning": 0.7393536567687988,
"adv/std_step_conf": 0.9358422756195068,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.8130050505050506,
"calib/avg_num_step_conf": 4.421875,
"calib/ece": 0.19507936507936507,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.4603174603174603,
"calib/gap": 0.5058333333333334,
"calib/mean_conf": 0.5258730158730158,
"calib/mu_c": 0.7908333333333334,
"calib/mu_w": 0.285,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.12238095238095237,
"calib/std_conf": 0.45866398435205064,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5151640316205534,
"calib/step_q_c_n": 506.0,
"calib/step_q_gap": 0.19083975047039364,
"calib/step_q_w": 0.3243242811501597,
"calib/step_q_w_n": 626.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2099.0,
"completions/max_terminated_length": 2099.0,
"completions/mean_length": 449.78515625,
"completions/mean_terminated_length": 451.5490417480469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.2048,
"grad_norm": 0.04594694823026657,
"kl": 0.146728515625,
"learning_rate": 2.2222222222222224e-07,
"loss": -0.0829,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.038958802819252014,
"mask/share_reasoning": 0.8438724875450134,
"mask/share_step_conf": 0.11326245963573456,
"num_tokens": 42907267.0,
"reward": 1.3796112537384033,
"reward_std": 0.30651265382766724,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.7699777483940125,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8500910997390747,
"step": 192
},
{
"adv/mean_abs_final_conf": 0.735534131526947,
"adv/mean_abs_reasoning": 0.6952435970306396,
"adv/mean_abs_step_conf": 0.7746602296829224,
"adv/ratio_final_to_reasoning": 1.0579516800562951,
"adv/ratio_step_to_reasoning": 1.1142284991785156,
"adv/std_final_conf": 0.8841529488563538,
"adv/std_reasoning": 0.8748270869255066,
"adv/std_step_conf": 0.9360180497169495,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7600585688820982,
"calib/avg_num_step_conf": 4.58984375,
"calib/ece": 0.23414741035856573,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.4063745019920319,
"calib/gap": 0.4103379169849759,
"calib/mean_conf": 0.48848207171314745,
"calib/mu_c": 0.7042773109243698,
"calib/mu_w": 0.29393939393939394,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.12426294820717132,
"calib/std_conf": 0.45367003941042455,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4586973684210527,
"calib/step_q_c_n": 532.0,
"calib/step_q_gap": 0.14031006930233836,
"calib/step_q_w": 0.31838729911871433,
"calib/step_q_w_n": 643.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2159.0,
"completions/max_terminated_length": 2159.0,
"completions/mean_length": 451.5078125,
"completions/mean_terminated_length": 455.06298828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 66.0,
"epoch": 0.20586666666666667,
"grad_norm": 0.0538572259247303,
"kl": 0.145263671875,
"learning_rate": 1.9444444444444447e-07,
"loss": -0.0346,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.0355381965637207,
"mask/share_reasoning": 0.8476927280426025,
"mask/share_step_conf": 0.10895660519599915,
"num_tokens": 43128565.0,
"reward": 1.3470118045806885,
"reward_std": 0.28083106875419617,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/final_brier_reward_step": 0.7346318960189819,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8351645469665527,
"step": 193
},
{
"adv/mean_abs_final_conf": 0.6366356015205383,
"adv/mean_abs_reasoning": 0.5255308747291565,
"adv/mean_abs_step_conf": 0.7562828063964844,
"adv/ratio_final_to_reasoning": 1.2114142710428613,
"adv/ratio_step_to_reasoning": 1.4390834920712332,
"adv/std_final_conf": 0.8287248611450195,
"adv/std_reasoning": 0.7394238710403442,
"adv/std_step_conf": 0.9351856708526611,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7922456575682382,
"calib/avg_num_step_conf": 4.4140625,
"calib/ece": 0.22039370078740161,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5078740157480315,
"calib/gap": 0.5165434243176177,
"calib/mean_conf": 0.5574015748031497,
"calib/mu_c": 0.821774193548387,
"calib/mu_w": 0.30523076923076925,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14480314960629925,
"calib/std_conf": 0.4716560080384642,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.49373540856031123,
"calib/step_q_c_n": 514.0,
"calib/step_q_gap": 0.1829614248204739,
"calib/step_q_w": 0.31077398373983733,
"calib/step_q_w_n": 615.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1607.0,
"completions/max_terminated_length": 1607.0,
"completions/mean_length": 412.2265625,
"completions/mean_terminated_length": 413.8431701660156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 101.0,
"epoch": 0.20693333333333333,
"grad_norm": 0.039248399436473846,
"kl": 0.1414642333984375,
"learning_rate": 1.6666666666666668e-07,
"loss": 0.0308,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04142865911126137,
"mask/share_reasoning": 0.8406766653060913,
"mask/share_step_conf": 0.11398839950561523,
"num_tokens": 43340039.0,
"reward": 1.3840529918670654,
"reward_std": 0.26664406061172485,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.7710093259811401,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8512825965881348,
"step": 194
},
{
"adv/mean_abs_final_conf": 0.6953327655792236,
"adv/mean_abs_reasoning": 0.5130212306976318,
"adv/mean_abs_step_conf": 0.7610127925872803,
"adv/ratio_final_to_reasoning": 1.3553684018762255,
"adv/ratio_step_to_reasoning": 1.4833943452055915,
"adv/std_final_conf": 0.8780831098556519,
"adv/std_reasoning": 0.7575872540473938,
"adv/std_step_conf": 0.9359692931175232,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7829176896239824,
"calib/avg_num_step_conf": 4.75,
"calib/ece": 0.22101195219123498,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.4900398406374502,
"calib/gap": 0.49266817418271114,
"calib/mean_conf": 0.5397848605577689,
"calib/mu_c": 0.7537323943661973,
"calib/mu_w": 0.2610642201834862,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.09752988047808757,
"calib/std_conf": 0.46986335380076294,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5041426307448496,
"calib/step_q_c_n": 631.0,
"calib/step_q_gap": 0.2346674170696359,
"calib/step_q_w": 0.26947521367521365,
"calib/step_q_w_n": 585.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2777.0,
"completions/max_terminated_length": 2777.0,
"completions/mean_length": 446.8046875,
"completions/mean_terminated_length": 452.102783203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.208,
"grad_norm": 0.06146930903196335,
"kl": 0.1854705810546875,
"learning_rate": 1.3888888888888888e-07,
"loss": -0.0375,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.037648532539606094,
"mask/share_reasoning": 0.8388372659683228,
"mask/share_step_conf": 0.11179547011852264,
"num_tokens": 43560405.0,
"reward": 1.3826078176498413,
"reward_std": 0.271342396736145,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.759816586971283,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8491839170455933,
"step": 195
},
{
"adv/mean_abs_final_conf": 0.6164750456809998,
"adv/mean_abs_reasoning": 0.39999616146087646,
"adv/mean_abs_step_conf": 0.7732023000717163,
"adv/ratio_final_to_reasoning": 1.541202404116813,
"adv/ratio_step_to_reasoning": 1.933024300152798,
"adv/std_final_conf": 0.841810405254364,
"adv/std_reasoning": 0.6613016724586487,
"adv/std_step_conf": 0.9360251426696777,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7458754001477469,
"calib/avg_num_step_conf": 4.1953125,
"calib/ece": 0.2534117647058824,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.6078431372549019,
"calib/gap": 0.3940137897069688,
"calib/mean_conf": 0.680156862745098,
"calib/mu_c": 0.8717557251908398,
"calib/mu_w": 0.47774193548387095,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.20992156862745098,
"calib/std_conf": 0.43007520421018475,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5337148014440433,
"calib/step_q_c_n": 554.0,
"calib/step_q_gap": 0.12296095529019707,
"calib/step_q_w": 0.4107538461538462,
"calib/step_q_w_n": 520.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2019.0,
"completions/max_terminated_length": 2019.0,
"completions/mean_length": 392.58984375,
"completions/mean_terminated_length": 392.58984375,
"completions/min_length": 127.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.20906666666666668,
"grad_norm": 0.05715157836675644,
"kl": 0.150390625,
"learning_rate": 1.1111111111111112e-07,
"loss": 0.035,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.042884960770606995,
"mask/share_reasoning": 0.8413792848587036,
"mask/share_step_conf": 0.115735724568367,
"num_tokens": 43763452.0,
"reward": 1.3373997211456299,
"reward_std": 0.24315345287322998,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.7315140962600708,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8208614587783813,
"step": 196
},
{
"adv/mean_abs_final_conf": 0.6945164203643799,
"adv/mean_abs_reasoning": 0.5376187562942505,
"adv/mean_abs_step_conf": 0.7531794905662537,
"adv/ratio_final_to_reasoning": 1.2918381515399657,
"adv/ratio_step_to_reasoning": 1.400954638855684,
"adv/std_final_conf": 0.8717400431632996,
"adv/std_reasoning": 0.7928566932678223,
"adv/std_step_conf": 0.9359936714172363,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7045021590043179,
"calib/avg_num_step_conf": 4.9140625,
"calib/ece": 0.2702788844621514,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5059760956175299,
"calib/gap": 0.36873571247142484,
"calib/mean_conf": 0.5565737051792828,
"calib/mu_c": 0.7431451612903225,
"calib/mu_w": 0.37440944881889765,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1664143426294821,
"calib/std_conf": 0.4634852094600902,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.48389640410958906,
"calib/step_q_c_n": 584.0,
"calib/step_q_gap": 0.16362489075647335,
"calib/step_q_w": 0.3202715133531157,
"calib/step_q_w_n": 674.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2870.0,
"completions/max_terminated_length": 2870.0,
"completions/mean_length": 480.3203125,
"completions/mean_terminated_length": 482.2039489746094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.21013333333333334,
"grad_norm": 0.05007997155189514,
"kl": 0.13787841796875,
"learning_rate": 8.333333333333334e-08,
"loss": -0.0201,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03720054030418396,
"mask/share_reasoning": 0.8442490100860596,
"mask/share_step_conf": 0.11464422941207886,
"num_tokens": 43991470.0,
"reward": 1.343414545059204,
"reward_std": 0.2829974293708801,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.7016687393188477,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8460958003997803,
"step": 197
},
{
"adv/mean_abs_final_conf": 0.6337748765945435,
"adv/mean_abs_reasoning": 0.5334087014198303,
"adv/mean_abs_step_conf": 0.7811760902404785,
"adv/ratio_final_to_reasoning": 1.188159988593283,
"adv/ratio_step_to_reasoning": 1.4644982133983557,
"adv/std_final_conf": 0.8478825688362122,
"adv/std_reasoning": 0.7753887176513672,
"adv/std_step_conf": 0.9359259605407715,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.8029621297337832,
"calib/avg_num_step_conf": 4.62890625,
"calib/ece": 0.18976284584980235,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.4505928853754941,
"calib/gap": 0.5294956880389953,
"calib/mean_conf": 0.5101581027667984,
"calib/mu_c": 0.7759523809523811,
"calib/mu_w": 0.2464566929133858,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10094861660079052,
"calib/std_conf": 0.46679291680297397,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5045838926174496,
"calib/step_q_c_n": 596.0,
"calib/step_q_gap": 0.17904772962933418,
"calib/step_q_w": 0.32553616298811544,
"calib/step_q_w_n": 589.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2587.0,
"completions/max_terminated_length": 2587.0,
"completions/mean_length": 407.10546875,
"completions/mean_terminated_length": 408.7019958496094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.2112,
"grad_norm": 0.046563468873500824,
"kl": 0.1541595458984375,
"learning_rate": 5.555555555555556e-08,
"loss": -0.0071,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.041801366955041885,
"mask/share_reasoning": 0.8286471962928772,
"mask/share_step_conf": 0.12564517557621002,
"num_tokens": 44201073.0,
"reward": 1.4073433876037598,
"reward_std": 0.24024641513824463,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.7873682975769043,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8656122088432312,
"step": 198
},
{
"adv/mean_abs_final_conf": 0.6945219039916992,
"adv/mean_abs_reasoning": 0.6268808841705322,
"adv/mean_abs_step_conf": 0.7525413036346436,
"adv/ratio_final_to_reasoning": 1.1079009131227016,
"adv/ratio_step_to_reasoning": 1.2004534236681679,
"adv/std_final_conf": 0.8746665716171265,
"adv/std_reasoning": 0.8428965210914612,
"adv/std_step_conf": 0.9360443353652954,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6880782695673918,
"calib/avg_num_step_conf": 4.6953125,
"calib/ece": 0.3001581027667984,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5533596837944664,
"calib/gap": 0.3234758689672418,
"calib/mean_conf": 0.6152569169960476,
"calib/mu_c": 0.7737984496124031,
"calib/mu_w": 0.4503225806451613,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2027667984189723,
"calib/std_conf": 0.4535808703312079,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5238173913043479,
"calib/step_q_c_n": 575.0,
"calib/step_q_gap": 0.1828824630746828,
"calib/step_q_w": 0.3409349282296651,
"calib/step_q_w_n": 627.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2129.0,
"completions/max_terminated_length": 2129.0,
"completions/mean_length": 465.59375,
"completions/mean_terminated_length": 467.4196472167969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 119.0,
"epoch": 0.21226666666666666,
"grad_norm": 0.07331021130084991,
"kl": 0.14813232421875,
"learning_rate": 2.777777777777778e-08,
"loss": 0.0272,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03809141740202904,
"mask/share_reasoning": 0.840973973274231,
"mask/share_step_conf": 0.1170283704996109,
"num_tokens": 44424465.0,
"reward": 1.325409173965454,
"reward_std": 0.28530770540237427,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.6828891038894653,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8351364731788635,
"step": 199
},
{
"adv/mean_abs_final_conf": 0.5704379081726074,
"adv/mean_abs_reasoning": 0.48373907804489136,
"adv/mean_abs_step_conf": 0.7290881872177124,
"adv/ratio_final_to_reasoning": 1.1792264343788872,
"adv/ratio_step_to_reasoning": 1.5071930722744968,
"adv/std_final_conf": 0.820499062538147,
"adv/std_reasoning": 0.7574408650398254,
"adv/std_step_conf": 0.9358140826225281,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.8717167872097449,
"calib/avg_num_step_conf": 3.953125,
"calib/ece": 0.1338339920948617,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.549407114624506,
"calib/gap": 0.647292221799264,
"calib/mean_conf": 0.5969960474308301,
"calib/mu_c": 0.8809859154929577,
"calib/mu_w": 0.2336936936936937,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.08478260869565221,
"calib/std_conf": 0.4614529253953416,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.586984375,
"calib/step_q_c_n": 576.0,
"calib/step_q_gap": 0.27550180619266057,
"calib/step_q_w": 0.31148256880733943,
"calib/step_q_w_n": 436.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2219.0,
"completions/max_terminated_length": 2219.0,
"completions/mean_length": 456.49609375,
"completions/mean_terminated_length": 456.49609375,
"completions/min_length": 105.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.21333333333333335,
"grad_norm": 0.05067267641425133,
"kl": 0.1455535888671875,
"learning_rate": 0.0,
"loss": 0.0631,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.038834765553474426,
"mask/share_reasoning": 0.8590643405914307,
"mask/share_step_conf": 0.1021009087562561,
"num_tokens": 44649376.0,
"reward": 1.457303524017334,
"reward_std": 0.21622072160243988,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.8482663631439209,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8788734078407288,
"step": 200
},
{
"epoch": 0.21333333333333335,
"step": 200,
"total_flos": 0.0,
"train_loss": 0.050295659240800886,
"train_runtime": 13081.6192,
"train_samples_per_second": 3.914,
"train_steps_per_second": 0.015
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 44649376,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}