Files
PureRL-7B-v7-s2-margin-maskon/trainer_state.json
ModelHub XC df60429042 初始化项目,由ModelHub XC社区提供模型
Model: zhaohq/PureRL-7B-v7-s2-margin-maskon
Source: Original Platform
2026-06-06 07:18:17 +08:00

12044 lines
494 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21333333333333335,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"adv/mean_abs_final_conf": 0.7557821869850159,
"adv/mean_abs_reasoning": 0.28040462732315063,
"adv/mean_abs_step_conf": 0.6320238709449768,
"adv/ratio_final_to_reasoning": 2.69532708571895,
"adv/ratio_step_to_reasoning": 2.2539709026149723,
"adv/std_final_conf": 0.9257818460464478,
"adv/std_reasoning": 0.5727222561836243,
"adv/std_step_conf": 0.8462716937065125,
"calib/answer_extract_rate": 0.96484375,
"calib/avg_num_step_conf": 14.59765625,
"calib/ece": 0.23243902439024394,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.008130081300813009,
"calib/gap": -0.04614489795918364,
"calib/mean_conf": 0.6646341463414636,
"calib/mu_c": 0.6552551020408164,
"calib/mu_w": 0.7014,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05016260162601624,
"calib/std_conf": 0.05917169015101882,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.583372,
"calib/step_q_c_n": 2500.0,
"calib/step_q_gap": -0.0778082748585287,
"calib/step_q_w": 0.6611802748585287,
"calib/step_q_w_n": 1237.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 1943.0,
"completions/max_terminated_length": 1943.0,
"completions/mean_length": 750.2265625,
"completions/mean_terminated_length": 780.7235717773438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 315.0,
"epoch": 0.0010666666666666667,
"grad_norm": 0.3166561424732208,
"kl": 0.00047022104263305664,
"learning_rate": 0.0,
"loss": -0.1462,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.01929234340786934,
"mask/share_reasoning": 0.7498296499252319,
"mask/share_step_conf": 0.19181546568870544,
"num_tokens": 299642.0,
"reward": 0.5737828612327576,
"reward_std": 0.07307003438472748,
"rewards/accuracy_reward_step": 0.765625,
"rewards/final_brier_reward_step": 0.7708241939544678,
"rewards/format_reward_step": 0.9609375,
"rewards/step_margin_reward": 0.0314289852976799,
"step": 1
},
{
"adv/mean_abs_final_conf": 0.7929245233535767,
"adv/mean_abs_reasoning": 0.4050842523574829,
"adv/mean_abs_step_conf": 0.6475293636322021,
"adv/ratio_final_to_reasoning": 1.9574311238685933,
"adv/ratio_step_to_reasoning": 1.598505396997669,
"adv/std_final_conf": 0.9301473498344421,
"adv/std_reasoning": 0.6612725853919983,
"adv/std_step_conf": 0.8531926274299622,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 14.078125,
"calib/ece": 0.04704724409448811,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.003937007874015748,
"calib/gap": 0.008169981916817282,
"calib/mean_conf": 0.6691732283464566,
"calib/mu_c": 0.6717142857142857,
"calib/mu_w": 0.6635443037974684,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.013622047244094477,
"calib/std_conf": 0.060200661111313364,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5911686697057605,
"calib/step_q_c_n": 2413.0,
"calib/step_q_gap": -0.011375410898773475,
"calib/step_q_w": 0.602544080604534,
"calib/step_q_w_n": 1191.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2563.0,
"completions/max_terminated_length": 2563.0,
"completions/mean_length": 867.8828125,
"completions/mean_terminated_length": 871.2863159179688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 375.0,
"epoch": 0.0021333333333333334,
"grad_norm": 0.5694341063499451,
"kl": 0.0006206929683685303,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.0783,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.01878987066447735,
"mask/share_reasoning": 0.7856365442276001,
"mask/share_step_conf": 0.1916673481464386,
"num_tokens": 625108.0,
"reward": 0.5773488879203796,
"reward_std": 0.08374661952257156,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.7790628671646118,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.04047856479883194,
"step": 2
},
{
"adv/mean_abs_final_conf": 0.7365277409553528,
"adv/mean_abs_reasoning": 0.3439093828201294,
"adv/mean_abs_step_conf": 0.610215425491333,
"adv/ratio_final_to_reasoning": 2.1416331677713187,
"adv/ratio_step_to_reasoning": 1.7743494535898903,
"adv/std_final_conf": 0.928483247756958,
"adv/std_reasoning": 0.6401665806770325,
"adv/std_step_conf": 0.8361523747444153,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 13.54296875,
"calib/ece": 0.14905882352941177,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.00392156862745098,
"calib/gap": -0.026731818181818134,
"calib/mean_conf": 0.6652156862745098,
"calib/mu_c": 0.65945,
"calib/mu_w": 0.6861818181818181,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.014980392156862742,
"calib/std_conf": 0.05215223331953581,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.583673786407767,
"calib/step_q_c_n": 2575.0,
"calib/step_q_gap": -0.02829930776263656,
"calib/step_q_w": 0.6119730941704036,
"calib/step_q_w_n": 892.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1534.0,
"completions/max_terminated_length": 1534.0,
"completions/mean_length": 798.23828125,
"completions/mean_terminated_length": 804.5236206054688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 329.0,
"epoch": 0.0032,
"grad_norm": 0.2995379865169525,
"kl": 0.0009461045265197754,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0562,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.019037947058677673,
"mask/share_reasoning": 0.7804015278816223,
"mask/share_step_conf": 0.1927480697631836,
"num_tokens": 934713.0,
"reward": 0.5961868762969971,
"reward_std": 0.07548847794532776,
"rewards/accuracy_reward_step": 0.78125,
"rewards/final_brier_reward_step": 0.8017418384552002,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.03516314923763275,
"step": 3
},
{
"adv/mean_abs_final_conf": 0.7282766699790955,
"adv/mean_abs_reasoning": 0.32462894916534424,
"adv/mean_abs_step_conf": 0.6020452976226807,
"adv/ratio_final_to_reasoning": 2.243412584895995,
"adv/ratio_step_to_reasoning": 1.8545644162992967,
"adv/std_final_conf": 0.9282972812652588,
"adv/std_reasoning": 0.6184795498847961,
"adv/std_step_conf": 0.8494290709495544,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 14.33203125,
"calib/ece": 0.11106299212598428,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.007874015748031496,
"calib/gap": -0.030018018018017845,
"calib/mean_conf": 0.6750787401574803,
"calib/mu_c": 0.6663333333333333,
"calib/mu_w": 0.6963513513513512,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03874015748031496,
"calib/std_conf": 0.0618783549220696,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5895343035343035,
"calib/step_q_c_n": 2405.0,
"calib/step_q_gap": -0.021051139503671212,
"calib/step_q_w": 0.6105854430379747,
"calib/step_q_w_n": 1264.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2346.0,
"completions/max_terminated_length": 2346.0,
"completions/mean_length": 871.1015625,
"completions/mean_terminated_length": 881.4308471679688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 381.0,
"epoch": 0.004266666666666667,
"grad_norm": 0.8032536506652832,
"kl": 0.0005264580249786377,
"learning_rate": 7.5e-07,
"loss": -0.0951,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.017951160669326782,
"mask/share_reasoning": 0.7815943956375122,
"mask/share_step_conf": 0.1887357234954834,
"num_tokens": 1263883.0,
"reward": 0.5748437643051147,
"reward_std": 0.07029317319393158,
"rewards/accuracy_reward_step": 0.703125,
"rewards/final_brier_reward_step": 0.7701238393783569,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.04050109535455704,
"step": 4
},
{
"adv/mean_abs_final_conf": 0.7468620538711548,
"adv/mean_abs_reasoning": 0.28012946248054504,
"adv/mean_abs_step_conf": 0.45508497953414917,
"adv/ratio_final_to_reasoning": 2.6661317494336187,
"adv/ratio_step_to_reasoning": 1.6245523605563437,
"adv/std_final_conf": 0.930158793926239,
"adv/std_reasoning": 0.5726768970489502,
"adv/std_step_conf": 0.7269405722618103,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 14.49609375,
"calib/ece": 0.17788000000000004,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.012,
"calib/gap": -0.04787837094111158,
"calib/mean_conf": 0.67148,
"calib/mu_c": 0.653860759493671,
"calib/mu_w": 0.7017391304347825,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10867999999999997,
"calib/std_conf": 0.0702894700506413,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5855629139072848,
"calib/step_q_c_n": 1963.0,
"calib/step_q_gap": -0.04010642247715468,
"calib/step_q_w": 0.6256693363844394,
"calib/step_q_w_n": 1748.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2953.0,
"completions/max_terminated_length": 2953.0,
"completions/mean_length": 847.38671875,
"completions/mean_terminated_length": 860.8373413085938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 300.0,
"epoch": 0.005333333333333333,
"grad_norm": 0.24338386952877045,
"kl": 0.0006460249423980713,
"learning_rate": 1.0000000000000002e-06,
"loss": -0.0796,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.0191742405295372,
"mask/share_reasoning": 0.7722139358520508,
"mask/share_step_conf": 0.1929868459701538,
"num_tokens": 1587502.0,
"reward": 0.530252993106842,
"reward_std": 0.06734541058540344,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.7213417887687683,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.020414207130670547,
"step": 5
},
{
"adv/mean_abs_final_conf": 0.745267927646637,
"adv/mean_abs_reasoning": 0.2637953758239746,
"adv/mean_abs_step_conf": 0.6003158688545227,
"adv/ratio_final_to_reasoning": 2.825174343252853,
"adv/ratio_step_to_reasoning": 2.2756876119583747,
"adv/std_final_conf": 0.9142794013023376,
"adv/std_reasoning": 0.5482735633850098,
"adv/std_step_conf": 0.8366710543632507,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 12.81640625,
"calib/ece": 0.08827450980392164,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.00392156862745098,
"calib/gap": -0.0011739983188567837,
"calib/mean_conf": 0.6561960784313725,
"calib/mu_c": 0.6558139534883721,
"calib/mu_w": 0.6569879518072289,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.034980392156862744,
"calib/std_conf": 0.05665202334716018,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5864367816091954,
"calib/step_q_c_n": 2175.0,
"calib/step_q_gap": -0.004973706636735797,
"calib/step_q_w": 0.5914104882459312,
"calib/step_q_w_n": 1106.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2192.0,
"completions/max_terminated_length": 2192.0,
"completions/mean_length": 753.89453125,
"completions/mean_terminated_length": 759.8306884765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 348.0,
"epoch": 0.0064,
"grad_norm": 0.36567795276641846,
"kl": 0.001254260540008545,
"learning_rate": 1.25e-06,
"loss": -0.0765,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.02114243432879448,
"mask/share_reasoning": 0.7748997211456299,
"mask/share_step_conf": 0.19614538550376892,
"num_tokens": 1886451.0,
"reward": 0.5701749324798584,
"reward_std": 0.05616578459739685,
"rewards/accuracy_reward_step": 0.671875,
"rewards/final_brier_reward_step": 0.773360550403595,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.033395521342754364,
"step": 6
},
{
"adv/mean_abs_final_conf": 0.7575596570968628,
"adv/mean_abs_reasoning": 0.36128178238868713,
"adv/mean_abs_step_conf": 0.632476806640625,
"adv/ratio_final_to_reasoning": 2.096866473831326,
"adv/ratio_step_to_reasoning": 1.750646828796286,
"adv/std_final_conf": 0.9292610287666321,
"adv/std_reasoning": 0.6404456496238708,
"adv/std_step_conf": 0.8506557941436768,
"calib/answer_extract_rate": 0.953125,
"calib/avg_num_step_conf": 15.05078125,
"calib/ece": 0.08143442622950824,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.01089819376026302,
"calib/mean_conf": 0.6709426229508196,
"calib/mu_c": 0.6678160919540229,
"calib/mu_w": 0.6787142857142859,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.019631147540983645,
"calib/std_conf": 0.04872485466343217,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5869490800171159,
"calib/step_q_c_n": 2337.0,
"calib/step_q_gap": -0.056236935814018674,
"calib/step_q_w": 0.6431860158311345,
"calib/step_q_w_n": 1516.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2855.0,
"completions/max_terminated_length": 2855.0,
"completions/mean_length": 861.77734375,
"completions/mean_terminated_length": 896.8088989257812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 347.0,
"epoch": 0.007466666666666667,
"grad_norm": 0.2327389270067215,
"kl": 0.0005173087120056152,
"learning_rate": 1.5e-06,
"loss": -0.1872,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.01713840663433075,
"mask/share_reasoning": 0.7627436518669128,
"mask/share_step_conf": 0.1810554563999176,
"num_tokens": 2214490.0,
"reward": 0.5554234981536865,
"reward_std": 0.0987781286239624,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/final_brier_reward_step": 0.7499246001243591,
"rewards/format_reward_step": 0.953125,
"rewards/step_margin_reward": 0.034359902143478394,
"step": 7
},
{
"adv/mean_abs_final_conf": 0.7442705631256104,
"adv/mean_abs_reasoning": 0.3813665807247162,
"adv/mean_abs_step_conf": 0.5766507387161255,
"adv/ratio_final_to_reasoning": 1.9515883161845557,
"adv/ratio_step_to_reasoning": 1.5120641604733904,
"adv/std_final_conf": 0.9289296269416809,
"adv/std_reasoning": 0.6613633036613464,
"adv/std_step_conf": 0.8194501399993896,
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 15.109375,
"calib/ece": 0.12160642570281124,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.012048192771084338,
"calib/gap": -0.03541341991341973,
"calib/mean_conf": 0.672128514056225,
"calib/mu_c": 0.6601818181818182,
"calib/mu_w": 0.6955952380952379,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06554216867469877,
"calib/std_conf": 0.06362452254971758,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5869551427276846,
"calib/step_q_c_n": 2207.0,
"calib/step_q_gap": -0.036259788036914986,
"calib/step_q_w": 0.6232149307645996,
"calib/step_q_w_n": 1661.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2974.0,
"completions/max_terminated_length": 2974.0,
"completions/mean_length": 860.09765625,
"completions/mean_terminated_length": 880.7400512695312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 339.0,
"epoch": 0.008533333333333334,
"grad_norm": 0.2793032228946686,
"kl": 0.0006050467491149902,
"learning_rate": 1.75e-06,
"loss": -0.1417,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.018345724791288376,
"mask/share_reasoning": 0.7716814875602722,
"mask/share_step_conf": 0.18653526902198792,
"num_tokens": 2541187.0,
"reward": 0.5402415990829468,
"reward_std": 0.09596795588731766,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7357991933822632,
"rewards/format_reward_step": 0.97265625,
"rewards/step_margin_reward": 0.021246377378702164,
"step": 8
},
{
"adv/mean_abs_final_conf": 0.7212363481521606,
"adv/mean_abs_reasoning": 0.2584291398525238,
"adv/mean_abs_step_conf": 0.6063222885131836,
"adv/ratio_final_to_reasoning": 2.790847613251912,
"adv/ratio_step_to_reasoning": 2.3461839050317232,
"adv/std_final_conf": 0.9257317185401917,
"adv/std_reasoning": 0.572688639163971,
"adv/std_step_conf": 0.8504953384399414,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 14.3828125,
"calib/ece": 0.10083665338645421,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.027888446215139442,
"calib/gap": 0.0267156366092538,
"calib/mean_conf": 0.6725498007968128,
"calib/mu_c": 0.6792553191489362,
"calib/mu_w": 0.6525396825396824,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.012191235059760962,
"calib/std_conf": 0.07387855134317332,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6040762041696621,
"calib/step_q_c_n": 2782.0,
"calib/step_q_gap": 0.007298426391884272,
"calib/step_q_w": 0.5967777777777779,
"calib/step_q_w_n": 900.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2508.0,
"completions/max_terminated_length": 2508.0,
"completions/mean_length": 860.6171875,
"completions/mean_terminated_length": 870.8221435546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 357.0,
"epoch": 0.0096,
"grad_norm": 1.0299285650253296,
"kl": 0.0008474588394165039,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.1154,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.01888362690806389,
"mask/share_reasoning": 0.7786691188812256,
"mask/share_step_conf": 0.19072850048542023,
"num_tokens": 2869041.0,
"reward": 0.5908889770507812,
"reward_std": 0.07018930464982986,
"rewards/accuracy_reward_step": 0.734375,
"rewards/final_brier_reward_step": 0.7949097156524658,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.043899379670619965,
"step": 9
},
{
"adv/mean_abs_final_conf": 0.7579138875007629,
"adv/mean_abs_reasoning": 0.41535621881484985,
"adv/mean_abs_step_conf": 0.6774593591690063,
"adv/ratio_final_to_reasoning": 1.8247322494974185,
"adv/ratio_step_to_reasoning": 1.6310321802861756,
"adv/std_final_conf": 0.9309507608413696,
"adv/std_reasoning": 0.7014256119728088,
"adv/std_step_conf": 0.8841572403907776,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 15.11328125,
"calib/ece": 0.11116,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.018304761904761713,
"calib/mean_conf": 0.6741199999999999,
"calib/mu_c": 0.6686285714285716,
"calib/mu_w": 0.6869333333333333,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.04264000000000001,
"calib/std_conf": 0.06341155730621982,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5946551724137931,
"calib/step_q_c_n": 2436.0,
"calib/step_q_gap": -0.03770351565319929,
"calib/step_q_w": 0.6323586880669924,
"calib/step_q_w_n": 1433.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2962.0,
"completions/max_terminated_length": 2962.0,
"completions/mean_length": 896.28515625,
"completions/mean_terminated_length": 910.511962890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 366.0,
"epoch": 0.010666666666666666,
"grad_norm": 0.38899895548820496,
"kl": 0.001188516616821289,
"learning_rate": 2.25e-06,
"loss": -0.1247,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.018132302910089493,
"mask/share_reasoning": 0.7785909175872803,
"mask/share_step_conf": 0.18765179812908173,
"num_tokens": 3205290.0,
"reward": 0.564469575881958,
"reward_std": 0.1037634089589119,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.7593957185745239,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.03751220554113388,
"step": 10
},
{
"adv/mean_abs_final_conf": 0.7381426095962524,
"adv/mean_abs_reasoning": 0.2654629647731781,
"adv/mean_abs_step_conf": 0.5914384722709656,
"adv/ratio_final_to_reasoning": 2.7805860234664004,
"adv/ratio_step_to_reasoning": 2.227950979061481,
"adv/std_final_conf": 0.9269818663597107,
"adv/std_reasoning": 0.54827481508255,
"adv/std_step_conf": 0.8175468444824219,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 15.15234375,
"calib/ece": 0.17334645669291335,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.015748031496062992,
"calib/gap": -0.05524509803921562,
"calib/mean_conf": 0.6788582677165355,
"calib/mu_c": 0.6605882352941177,
"calib/mu_w": 0.7158333333333333,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09145669291338585,
"calib/std_conf": 0.06792100835917231,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5851105216622458,
"calib/step_q_c_n": 2262.0,
"calib/step_q_gap": -0.0441040732666349,
"calib/step_q_w": 0.6292145949288807,
"calib/step_q_w_n": 1617.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2497.0,
"completions/max_terminated_length": 2497.0,
"completions/mean_length": 910.31640625,
"completions/mean_terminated_length": 913.8863525390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 328.0,
"epoch": 0.011733333333333333,
"grad_norm": 1.270994782447815,
"kl": 0.0012431144714355469,
"learning_rate": 2.5e-06,
"loss": -0.0053,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.018155530095100403,
"mask/share_reasoning": 0.7854121923446655,
"mask/share_step_conf": 0.19252607226371765,
"num_tokens": 3542811.0,
"reward": 0.5488805770874023,
"reward_std": 0.06458449363708496,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.7436433434486389,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.02286791056394577,
"step": 11
},
{
"adv/mean_abs_final_conf": 0.7413728833198547,
"adv/mean_abs_reasoning": 0.3635881543159485,
"adv/mean_abs_step_conf": 0.6452409625053406,
"adv/ratio_final_to_reasoning": 2.039045756907749,
"adv/ratio_step_to_reasoning": 1.7746479219579945,
"adv/std_final_conf": 0.9299033880233765,
"adv/std_reasoning": 0.6614682674407959,
"adv/std_step_conf": 0.8669188618659973,
"calib/answer_extract_rate": 0.95703125,
"calib/avg_num_step_conf": 14.73828125,
"calib/ece": 0.12053061224489801,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.012244897959183673,
"calib/gap": -0.0223040752351098,
"calib/mean_conf": 0.6707346938775509,
"calib/mu_c": 0.6654545454545454,
"calib/mu_w": 0.6877586206896552,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.013999999999999999,
"calib/std_conf": 0.06331224065489251,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5873842302878598,
"calib/step_q_c_n": 2397.0,
"calib/step_q_gap": -0.06773931622376816,
"calib/step_q_w": 0.655123546511628,
"calib/step_q_w_n": 1376.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 3046.0,
"completions/max_terminated_length": 3046.0,
"completions/mean_length": 778.83984375,
"completions/mean_terminated_length": 810.5,
"completions/min_length": 0.0,
"completions/min_terminated_length": 226.0,
"epoch": 0.0128,
"grad_norm": 0.2657943665981293,
"kl": 0.002060413360595703,
"learning_rate": 2.7500000000000004e-06,
"loss": -0.1714,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.019777968525886536,
"mask/share_reasoning": 0.743057906627655,
"mask/share_step_conf": 0.19810162484645844,
"num_tokens": 3846370.0,
"reward": 0.5683996081352234,
"reward_std": 0.10034461319446564,
"rewards/accuracy_reward_step": 0.73046875,
"rewards/final_brier_reward_step": 0.7643597722053528,
"rewards/format_reward_step": 0.95703125,
"rewards/step_margin_reward": 0.03493950143456459,
"step": 12
},
{
"adv/mean_abs_final_conf": 0.7224041819572449,
"adv/mean_abs_reasoning": 0.39203640818595886,
"adv/mean_abs_step_conf": 0.6825897693634033,
"adv/ratio_final_to_reasoning": 1.8426966650877463,
"adv/ratio_step_to_reasoning": 1.741138718523365,
"adv/std_final_conf": 0.9304549098014832,
"adv/std_reasoning": 0.6816896200180054,
"adv/std_step_conf": 0.883434534072876,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 13.984375,
"calib/ece": 0.0858964143426295,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.0032125970955758643,
"calib/mean_conf": 0.666374501992032,
"calib/mu_c": 0.6671808510638297,
"calib/mu_w": 0.6639682539682539,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.001633466135458168,
"calib/std_conf": 0.05525748482093671,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.588135593220339,
"calib/step_q_c_n": 2478.0,
"calib/step_q_gap": -0.02999507828601311,
"calib/step_q_w": 0.6181306715063521,
"calib/step_q_w_n": 1102.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2864.0,
"completions/max_terminated_length": 2864.0,
"completions/mean_length": 817.25,
"completions/mean_terminated_length": 830.2222900390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 209.0,
"epoch": 0.013866666666666666,
"grad_norm": 0.33146727085113525,
"kl": 0.0026552677154541016,
"learning_rate": 3e-06,
"loss": -0.1326,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.019377902150154114,
"mask/share_reasoning": 0.7720924615859985,
"mask/share_step_conf": 0.19290460646152496,
"num_tokens": 4160178.0,
"reward": 0.5818509459495544,
"reward_std": 0.09568098932504654,
"rewards/accuracy_reward_step": 0.734375,
"rewards/final_brier_reward_step": 0.787639856338501,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.033093370497226715,
"step": 13
},
{
"adv/mean_abs_final_conf": 0.7974610328674316,
"adv/mean_abs_reasoning": 0.41913557052612305,
"adv/mean_abs_step_conf": 0.6118457913398743,
"adv/ratio_final_to_reasoning": 1.902632677695221,
"adv/ratio_step_to_reasoning": 1.4597801627092883,
"adv/std_final_conf": 0.9300772547721863,
"adv/std_reasoning": 0.6614927053451538,
"adv/std_step_conf": 0.837138295173645,
"calib/answer_extract_rate": 0.9453125,
"calib/avg_num_step_conf": 16.3359375,
"calib/ece": 0.1026859504132232,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.01652892561983471,
"calib/gap": -0.014451490171211057,
"calib/mean_conf": 0.6792975206611571,
"calib/mu_c": 0.6747590361445784,
"calib/mu_w": 0.6892105263157895,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.048016528925619875,
"calib/std_conf": 0.06787143225368777,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5958808618504435,
"calib/step_q_c_n": 2367.0,
"calib/step_q_gap": -0.06811930343881267,
"calib/step_q_w": 0.6640001652892562,
"calib/step_q_w_n": 1815.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 2500.0,
"completions/max_terminated_length": 2500.0,
"completions/mean_length": 876.8671875,
"completions/mean_terminated_length": 927.5950317382812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 262.0,
"epoch": 0.014933333333333333,
"grad_norm": 0.644323468208313,
"kl": 0.0029511451721191406,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.2248,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.017249684780836105,
"mask/share_reasoning": 0.7453184723854065,
"mask/share_step_conf": 0.1827443391084671,
"num_tokens": 4490056.0,
"reward": 0.5408897995948792,
"reward_std": 0.10242260992527008,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.7313886880874634,
"rewards/format_reward_step": 0.9453125,
"rewards/step_margin_reward": 0.03164093196392059,
"step": 14
},
{
"adv/mean_abs_final_conf": 0.7296807169914246,
"adv/mean_abs_reasoning": 0.40887928009033203,
"adv/mean_abs_step_conf": 0.6450937986373901,
"adv/ratio_final_to_reasoning": 1.784587169176729,
"adv/ratio_step_to_reasoning": 1.5777121269017893,
"adv/std_final_conf": 0.9294079542160034,
"adv/std_reasoning": 0.7013062834739685,
"adv/std_step_conf": 0.8664496541023254,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 14.42578125,
"calib/ece": 0.12795275590551183,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.007874015748031496,
"calib/gap": -0.03773325318907372,
"calib/mean_conf": 0.67748031496063,
"calib/mu_c": 0.6636645962732919,
"calib/mu_w": 0.7013978494623656,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08578740157480316,
"calib/std_conf": 0.06818188393949375,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5890122249388753,
"calib/step_q_c_n": 2045.0,
"calib/step_q_gap": -0.027777823604814067,
"calib/step_q_w": 0.6167900485436894,
"calib/step_q_w_n": 1648.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2555.0,
"completions/max_terminated_length": 2555.0,
"completions/mean_length": 824.375,
"completions/mean_terminated_length": 830.8661499023438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 348.0,
"epoch": 0.016,
"grad_norm": 0.43208855390548706,
"kl": 0.005416393280029297,
"learning_rate": 3.5e-06,
"loss": -0.0519,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.019296735525131226,
"mask/share_reasoning": 0.7683975696563721,
"mask/share_step_conf": 0.2044931948184967,
"num_tokens": 4808976.0,
"reward": 0.5477840304374695,
"reward_std": 0.09553329646587372,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.738040566444397,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.03330870717763901,
"step": 15
},
{
"adv/mean_abs_final_conf": 0.7689300179481506,
"adv/mean_abs_reasoning": 0.3367258608341217,
"adv/mean_abs_step_conf": 0.5895552039146423,
"adv/ratio_final_to_reasoning": 2.2835490450403566,
"adv/ratio_step_to_reasoning": 1.7508462297912715,
"adv/std_final_conf": 0.9287763237953186,
"adv/std_reasoning": 0.6187803745269775,
"adv/std_step_conf": 0.8190042972564697,
"calib/answer_extract_rate": 0.95703125,
"calib/avg_num_step_conf": 16.66015625,
"calib/ece": 0.11865306122448982,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.02222402597402595,
"calib/mean_conf": 0.6964489795918367,
"calib/mu_c": 0.6894642857142858,
"calib/mu_w": 0.7116883116883117,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06469387755102042,
"calib/std_conf": 0.06712798048701368,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6045779348252847,
"calib/step_q_c_n": 2547.0,
"calib/step_q_gap": -0.03265402093722991,
"calib/step_q_w": 0.6372319557625146,
"calib/step_q_w_n": 1718.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2497.0,
"completions/max_terminated_length": 2497.0,
"completions/mean_length": 1002.92578125,
"completions/mean_terminated_length": 1043.695068359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 295.0,
"epoch": 0.017066666666666667,
"grad_norm": 0.24544142186641693,
"kl": 0.0057163238525390625,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.1641,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.015103766694664955,
"mask/share_reasoning": 0.7649943828582764,
"mask/share_step_conf": 0.18083932995796204,
"num_tokens": 5174573.0,
"reward": 0.5389186143875122,
"reward_std": 0.09756513684988022,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.7371909618377686,
"rewards/format_reward_step": 0.95703125,
"rewards/step_margin_reward": 0.017989974468946457,
"step": 16
},
{
"adv/mean_abs_final_conf": 0.7784232497215271,
"adv/mean_abs_reasoning": 0.2690816819667816,
"adv/mean_abs_step_conf": 0.5825159549713135,
"adv/ratio_final_to_reasoning": 2.892888300800885,
"adv/ratio_step_to_reasoning": 2.164829470046295,
"adv/std_final_conf": 0.9275980591773987,
"adv/std_reasoning": 0.5483062267303467,
"adv/std_step_conf": 0.8175146579742432,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 14.57421875,
"calib/ece": 0.1028286852589641,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.0199203187250996,
"calib/gap": 0.0046954270923209585,
"calib/mean_conf": 0.6889641434262949,
"calib/mu_c": 0.6901052631578948,
"calib/mu_w": 0.6854098360655738,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.01741035856573707,
"calib/std_conf": 0.07718479603011531,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6035703592814371,
"calib/step_q_c_n": 2672.0,
"calib/step_q_gap": -0.022582331936693234,
"calib/step_q_w": 0.6261526912181303,
"calib/step_q_w_n": 1059.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2517.0,
"completions/max_terminated_length": 2517.0,
"completions/mean_length": 861.125,
"completions/mean_terminated_length": 878.2789306640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 274.0,
"epoch": 0.018133333333333335,
"grad_norm": 0.2567841708660126,
"kl": 0.0070972442626953125,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0918,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.01926950365304947,
"mask/share_reasoning": 0.7539726495742798,
"mask/share_step_conf": 0.20722657442092896,
"num_tokens": 5498549.0,
"reward": 0.5851441621780396,
"reward_std": 0.06767553091049194,
"rewards/accuracy_reward_step": 0.7421875,
"rewards/final_brier_reward_step": 0.7914144396781921,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.034342579543590546,
"step": 17
},
{
"adv/mean_abs_final_conf": 0.766454815864563,
"adv/mean_abs_reasoning": 0.32873159646987915,
"adv/mean_abs_step_conf": 0.636091947555542,
"adv/ratio_final_to_reasoning": 2.3315520141514336,
"adv/ratio_step_to_reasoning": 1.934988770128233,
"adv/std_final_conf": 0.9293180108070374,
"adv/std_reasoning": 0.6187217831611633,
"adv/std_step_conf": 0.8517022728919983,
"calib/answer_extract_rate": 0.953125,
"calib/avg_num_step_conf": 16.29296875,
"calib/ece": 0.1363786008230452,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.00823045267489712,
"calib/gap": -0.024285714285714355,
"calib/mean_conf": 0.6750617283950617,
"calib/mu_c": 0.6666666666666666,
"calib/mu_w": 0.690952380952381,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07855967078189298,
"calib/std_conf": 0.08319925198683596,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5914795678722404,
"calib/step_q_c_n": 2129.0,
"calib/step_q_gap": -0.03671529011012986,
"calib/step_q_w": 0.6281948579823703,
"calib/step_q_w_n": 2042.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2946.0,
"completions/max_terminated_length": 2946.0,
"completions/mean_length": 871.078125,
"completions/mean_terminated_length": 906.48779296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 368.0,
"epoch": 0.0192,
"grad_norm": 0.546768844127655,
"kl": 0.009520530700683594,
"learning_rate": 4.25e-06,
"loss": -0.1916,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.01771453768014908,
"mask/share_reasoning": 0.7553741931915283,
"mask/share_step_conf": 0.18784880638122559,
"num_tokens": 5832265.0,
"reward": 0.5322065353393555,
"reward_std": 0.0910949558019638,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.7171124815940857,
"rewards/format_reward_step": 0.94921875,
"rewards/step_margin_reward": 0.03323813155293465,
"step": 18
},
{
"adv/mean_abs_final_conf": 0.7309796810150146,
"adv/mean_abs_reasoning": 0.2801279127597809,
"adv/mean_abs_step_conf": 0.5914919376373291,
"adv/ratio_final_to_reasoning": 2.6094496396789073,
"adv/ratio_step_to_reasoning": 2.111506603572752,
"adv/std_final_conf": 0.913943886756897,
"adv/std_reasoning": 0.5725624561309814,
"adv/std_step_conf": 0.8334521651268005,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 13.203125,
"calib/ece": 0.13434782608695645,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.02455696202531643,
"calib/mean_conf": 0.6576679841897233,
"calib/mu_c": 0.65,
"calib/mu_w": 0.6745569620253165,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05213438735177864,
"calib/std_conf": 0.04752050153246474,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5765921244530872,
"calib/step_q_c_n": 2057.0,
"calib/step_q_gap": -0.04079260721735878,
"calib/step_q_w": 0.6173847316704459,
"calib/step_q_w_n": 1323.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2093.0,
"completions/max_terminated_length": 2093.0,
"completions/mean_length": 736.97265625,
"completions/mean_terminated_length": 748.670654296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 295.0,
"epoch": 0.020266666666666665,
"grad_norm": 0.3068307042121887,
"kl": 0.014101028442382812,
"learning_rate": 4.5e-06,
"loss": -0.061,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.0203668475151062,
"mask/share_reasoning": 0.762851357460022,
"mask/share_step_conf": 0.2011568248271942,
"num_tokens": 6125690.0,
"reward": 0.561945915222168,
"reward_std": 0.05958160012960434,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/final_brier_reward_step": 0.7624972462654114,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.02780073508620262,
"step": 19
},
{
"adv/mean_abs_final_conf": 0.7632920742034912,
"adv/mean_abs_reasoning": 0.22523455321788788,
"adv/mean_abs_step_conf": 0.6193887591362,
"adv/ratio_final_to_reasoning": 3.3888764547822117,
"adv/ratio_step_to_reasoning": 2.749972196925817,
"adv/std_final_conf": 0.9254392981529236,
"adv/std_reasoning": 0.4959951937198639,
"adv/std_step_conf": 0.8340355753898621,
"calib/answer_extract_rate": 0.96484375,
"calib/avg_num_step_conf": 14.68359375,
"calib/ece": 0.1906890688259109,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.016194331983805668,
"calib/gap": -0.012711111111111006,
"calib/mean_conf": 0.6513368421052631,
"calib/mu_c": 0.647888888888889,
"calib/mu_w": 0.6606,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0566404858299595,
"calib/std_conf": 0.10182151993567717,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.583936120244246,
"calib/step_q_c_n": 2129.0,
"calib/step_q_gap": -0.06751387975575396,
"calib/step_q_w": 0.65145,
"calib/step_q_w_n": 1630.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2497.0,
"completions/max_terminated_length": 2497.0,
"completions/mean_length": 769.0859375,
"completions/mean_terminated_length": 793.8951416015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 243.0,
"epoch": 0.021333333333333333,
"grad_norm": 1.4331884384155273,
"kl": 0.028350830078125,
"learning_rate": 4.75e-06,
"loss": -0.1062,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.021386535838246346,
"mask/share_reasoning": 0.7441344857215881,
"mask/share_step_conf": 0.20322901010513306,
"num_tokens": 6427448.0,
"reward": 0.5562363862991333,
"reward_std": 0.06482435017824173,
"rewards/accuracy_reward_step": 0.703125,
"rewards/final_brier_reward_step": 0.7534843683242798,
"rewards/format_reward_step": 0.96484375,
"rewards/step_margin_reward": 0.02539462223649025,
"step": 20
},
{
"adv/mean_abs_final_conf": 0.7670572996139526,
"adv/mean_abs_reasoning": 0.3465600609779358,
"adv/mean_abs_step_conf": 0.6293857097625732,
"adv/ratio_final_to_reasoning": 2.2133459275412246,
"adv/ratio_step_to_reasoning": 1.8160941800002854,
"adv/std_final_conf": 0.9293149709701538,
"adv/std_reasoning": 0.6187368631362915,
"adv/std_step_conf": 0.8505994081497192,
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 15.0859375,
"calib/ece": 0.16473895582329326,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.01606425702811245,
"calib/gap": -0.034671428571428575,
"calib/mean_conf": 0.6607228915662651,
"calib/mu_c": 0.6539,
"calib/mu_w": 0.6885714285714286,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.011124497991967871,
"calib/std_conf": 0.07473298867607345,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5910108168593808,
"calib/step_q_c_n": 2681.0,
"calib/step_q_gap": -0.034198666629188135,
"calib/step_q_w": 0.625209483488569,
"calib/step_q_w_n": 1181.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2967.0,
"completions/max_terminated_length": 2967.0,
"completions/mean_length": 776.60546875,
"completions/mean_terminated_length": 792.0757446289062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 207.0,
"epoch": 0.0224,
"grad_norm": 0.31129884719848633,
"kl": 0.020071029663085938,
"learning_rate": 5e-06,
"loss": -0.1007,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.021010365337133408,
"mask/share_reasoning": 0.7523726224899292,
"mask/share_step_conf": 0.2070857584476471,
"num_tokens": 6729219.0,
"reward": 0.5802181363105774,
"reward_std": 0.08217073231935501,
"rewards/accuracy_reward_step": 0.78125,
"rewards/final_brier_reward_step": 0.7830750346183777,
"rewards/format_reward_step": 0.97265625,
"rewards/step_margin_reward": 0.02658003382384777,
"step": 21
},
{
"adv/mean_abs_final_conf": 0.7307391166687012,
"adv/mean_abs_reasoning": 0.18713341653347015,
"adv/mean_abs_step_conf": 0.6073583364486694,
"adv/ratio_final_to_reasoning": 3.904909824259011,
"adv/ratio_step_to_reasoning": 3.2455899523431135,
"adv/std_final_conf": 0.9273250699043274,
"adv/std_reasoning": 0.4675096869468689,
"adv/std_step_conf": 0.8511538505554199,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 12.73828125,
"calib/ece": 0.11175781250000003,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.00390625,
"calib/gap": 0.0027857142857141692,
"calib/mean_conf": 0.6611328125,
"calib/mu_c": 0.6617857142857142,
"calib/mu_w": 0.659,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.003632812499999997,
"calib/std_conf": 0.058429026269824526,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.590171009771987,
"calib/step_q_c_n": 2456.0,
"calib/step_q_gap": 0.0005436805794403199,
"calib/step_q_w": 0.5896273291925467,
"calib/step_q_w_n": 805.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2074.0,
"completions/max_terminated_length": 2074.0,
"completions/mean_length": 750.03515625,
"completions/mean_terminated_length": 755.94091796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 279.0,
"epoch": 0.023466666666666667,
"grad_norm": 0.27753156423568726,
"kl": 0.022693634033203125,
"learning_rate": 4.9722222222222224e-06,
"loss": -0.0472,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.02132297307252884,
"mask/share_reasoning": 0.7679464817047119,
"mask/share_step_conf": 0.20291808247566223,
"num_tokens": 7023044.0,
"reward": 0.5980198979377747,
"reward_std": 0.04576648771762848,
"rewards/accuracy_reward_step": 0.765625,
"rewards/final_brier_reward_step": 0.8072237968444824,
"rewards/format_reward_step": 1.0,
"rewards/step_margin_reward": 0.035690926015377045,
"step": 22
},
{
"adv/mean_abs_final_conf": 0.7114725112915039,
"adv/mean_abs_reasoning": 0.31436145305633545,
"adv/mean_abs_step_conf": 0.6340088844299316,
"adv/ratio_final_to_reasoning": 2.2632307631050548,
"adv/ratio_step_to_reasoning": 2.0168149697294897,
"adv/std_final_conf": 0.9128880500793457,
"adv/std_reasoning": 0.6185219883918762,
"adv/std_step_conf": 0.8657233715057373,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 15.53515625,
"calib/ece": 0.14125984251968499,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.011811023622047244,
"calib/gap": -0.01323305407463815,
"calib/mean_conf": 0.6806299212598425,
"calib/mu_c": 0.6779207920792079,
"calib/mu_w": 0.691153846153846,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.01330708661417321,
"calib/std_conf": 0.09799363655618468,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6040825998645905,
"calib/step_q_c_n": 2954.0,
"calib/step_q_gap": -0.004013196811851416,
"calib/step_q_w": 0.6080957966764419,
"calib/step_q_w_n": 1023.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2538.0,
"completions/max_terminated_length": 2538.0,
"completions/mean_length": 878.9140625,
"completions/mean_terminated_length": 889.3359985351562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 312.0,
"epoch": 0.024533333333333334,
"grad_norm": 0.3237517178058624,
"kl": 0.019628524780273438,
"learning_rate": 4.944444444444445e-06,
"loss": -0.0817,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.019940588623285294,
"mask/share_reasoning": 0.7632123827934265,
"mask/share_step_conf": 0.2051282525062561,
"num_tokens": 7351982.0,
"reward": 0.5972261428833008,
"reward_std": 0.07706841081380844,
"rewards/accuracy_reward_step": 0.7890625,
"rewards/final_brier_reward_step": 0.8038030862808228,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.03439916670322418,
"step": 23
},
{
"adv/mean_abs_final_conf": 0.7458991408348083,
"adv/mean_abs_reasoning": 0.3374817967414856,
"adv/mean_abs_step_conf": 0.6277369260787964,
"adv/ratio_final_to_reasoning": 2.210190736320438,
"adv/ratio_step_to_reasoning": 1.8600615859576246,
"adv/std_final_conf": 0.9292863011360168,
"adv/std_reasoning": 0.6185944676399231,
"adv/std_step_conf": 0.8499169945716858,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 14.07421875,
"calib/ece": 0.06044143426294826,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.01195219123505976,
"calib/gap": 0.0008164342357123155,
"calib/mean_conf": 0.667373705179283,
"calib/mu_c": 0.6676729559748428,
"calib/mu_w": 0.6668565217391305,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.04717450199203192,
"calib/std_conf": 0.08415062823820862,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.591605249873801,
"calib/step_q_c_n": 1981.0,
"calib/step_q_gap": -0.015375452962203884,
"calib/step_q_w": 0.6069807028360049,
"calib/step_q_w_n": 1622.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2888.0,
"completions/max_terminated_length": 2888.0,
"completions/mean_length": 811.828125,
"completions/mean_terminated_length": 821.45458984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 251.0,
"epoch": 0.0256,
"grad_norm": 0.33109045028686523,
"kl": 0.024871826171875,
"learning_rate": 4.9166666666666665e-06,
"loss": -0.0978,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.02108951471745968,
"mask/share_reasoning": 0.7626346349716187,
"mask/share_step_conf": 0.20455709099769592,
"num_tokens": 7664322.0,
"reward": 0.5434887409210205,
"reward_std": 0.08392756432294846,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.7451183199882507,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.021546650677919388,
"step": 24
},
{
"adv/mean_abs_final_conf": 0.707527756690979,
"adv/mean_abs_reasoning": 0.19326797127723694,
"adv/mean_abs_step_conf": 0.5437524914741516,
"adv/ratio_final_to_reasoning": 3.660863991147567,
"adv/ratio_step_to_reasoning": 2.8134640617412776,
"adv/std_final_conf": 0.927862823009491,
"adv/std_reasoning": 0.4959583878517151,
"adv/std_step_conf": 0.8003897666931152,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 14.15234375,
"calib/ece": 0.1054183266932271,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.023243804832999815,
"calib/mean_conf": 0.6718725099601593,
"calib/mu_c": 0.6651123595505618,
"calib/mu_w": 0.6883561643835616,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03406374501992032,
"calib/std_conf": 0.056836240624110364,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5857778776978417,
"calib/step_q_c_n": 2224.0,
"calib/step_q_gap": -0.036304323874710054,
"calib/step_q_w": 0.6220822015725518,
"calib/step_q_w_n": 1399.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2726.0,
"completions/max_terminated_length": 2726.0,
"completions/mean_length": 801.66796875,
"completions/mean_terminated_length": 811.1739501953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 226.0,
"epoch": 0.02666666666666667,
"grad_norm": 0.1912597119808197,
"kl": 0.023283004760742188,
"learning_rate": 4.888888888888889e-06,
"loss": -0.0584,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.01987135224044323,
"mask/share_reasoning": 0.7582444548606873,
"mask/share_step_conf": 0.21016541123390198,
"num_tokens": 7972773.0,
"reward": 0.5612221956253052,
"reward_std": 0.057776253670454025,
"rewards/accuracy_reward_step": 0.6953125,
"rewards/final_brier_reward_step": 0.7643148899078369,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.022973205894231796,
"step": 25
},
{
"adv/mean_abs_final_conf": 0.7434746026992798,
"adv/mean_abs_reasoning": 0.3130928874015808,
"adv/mean_abs_step_conf": 0.5985824465751648,
"adv/ratio_final_to_reasoning": 2.374613517635361,
"adv/ratio_step_to_reasoning": 1.9118366167399004,
"adv/std_final_conf": 0.9266886115074158,
"adv/std_reasoning": 0.596113920211792,
"adv/std_step_conf": 0.8195051550865173,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 14.53515625,
"calib/ece": 0.14855502008032126,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.006396436122017457,
"calib/mean_conf": 0.657791967871486,
"calib/mu_c": 0.6558139534883721,
"calib/mu_w": 0.6622103896103896,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05779196787148594,
"calib/std_conf": 0.07257252414506551,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5822783018867925,
"calib/step_q_c_n": 2120.0,
"calib/step_q_gap": -0.03643306600827312,
"calib/step_q_w": 0.6187113678950656,
"calib/step_q_w_n": 1601.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2826.0,
"completions/max_terminated_length": 2826.0,
"completions/mean_length": 760.25390625,
"completions/mean_terminated_length": 775.3984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 411.0,
"epoch": 0.027733333333333332,
"grad_norm": 0.3038398325443268,
"kl": 0.026044845581054688,
"learning_rate": 4.861111111111111e-06,
"loss": -0.0717,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.019195556640625,
"mask/share_reasoning": 0.758608341217041,
"mask/share_step_conf": 0.2026648223400116,
"num_tokens": 8272638.0,
"reward": 0.5579166412353516,
"reward_std": 0.07703261822462082,
"rewards/accuracy_reward_step": 0.671875,
"rewards/final_brier_reward_step": 0.756049633026123,
"rewards/format_reward_step": 0.97265625,
"rewards/step_margin_reward": 0.030877456068992615,
"step": 26
},
{
"adv/mean_abs_final_conf": 0.7445192933082581,
"adv/mean_abs_reasoning": 0.31904906034469604,
"adv/mean_abs_step_conf": 0.5985069274902344,
"adv/ratio_final_to_reasoning": 2.3335573924082085,
"adv/ratio_step_to_reasoning": 1.8759087610025118,
"adv/std_final_conf": 0.9297001361846924,
"adv/std_reasoning": 0.6184704899787903,
"adv/std_step_conf": 0.8344622850418091,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 13.73828125,
"calib/ece": 0.09807843137254894,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.00392156862745098,
"calib/gap": -0.016436713836477934,
"calib/mean_conf": 0.6729803921568628,
"calib/mu_c": 0.6667924528301887,
"calib/mu_w": 0.6832291666666667,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07376470588235291,
"calib/std_conf": 0.06798889476679937,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.58775,
"calib/step_q_c_n": 2000.0,
"calib/step_q_gap": -0.016983025708635413,
"calib/step_q_w": 0.6047330257086354,
"calib/step_q_w_n": 1517.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2535.0,
"completions/max_terminated_length": 2535.0,
"completions/mean_length": 833.25,
"completions/mean_terminated_length": 839.81103515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 307.0,
"epoch": 0.0288,
"grad_norm": 0.3839709460735321,
"kl": 0.03032684326171875,
"learning_rate": 4.833333333333333e-06,
"loss": -0.0129,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.01979503408074379,
"mask/share_reasoning": 0.7735280990600586,
"mask/share_step_conf": 0.19886434078216553,
"num_tokens": 8591166.0,
"reward": 0.5505564212799072,
"reward_std": 0.07114443182945251,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.7475433945655823,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.03013189509510994,
"step": 27
},
{
"adv/mean_abs_final_conf": 0.7383830547332764,
"adv/mean_abs_reasoning": 0.15154343843460083,
"adv/mean_abs_step_conf": 0.505989670753479,
"adv/ratio_final_to_reasoning": 4.8724185115538905,
"adv/ratio_step_to_reasoning": 3.338908473901632,
"adv/std_final_conf": 0.9208484292030334,
"adv/std_reasoning": 0.40508344769477844,
"adv/std_step_conf": 0.7664840817451477,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 15.31640625,
"calib/ece": 0.20844563492063484,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.01984126984126984,
"calib/gap": -0.02344361702127673,
"calib/mean_conf": 0.6589353174603175,
"calib/mu_c": 0.6529813829787233,
"calib/mu_w": 0.676425,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.06067460317460317,
"calib/std_conf": 0.11771983312008158,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5964008502024291,
"calib/step_q_c_n": 2470.0,
"calib/step_q_gap": -0.01219032829515887,
"calib/step_q_w": 0.6085911784975879,
"calib/step_q_w_n": 1451.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2924.0,
"completions/max_terminated_length": 2924.0,
"completions/mean_length": 883.35546875,
"completions/mean_terminated_length": 893.830078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 345.0,
"epoch": 0.029866666666666666,
"grad_norm": 2.493471622467041,
"kl": 0.038482666015625,
"learning_rate": 4.805555555555556e-06,
"loss": -0.0604,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.019157718867063522,
"mask/share_reasoning": 0.7706682682037354,
"mask/share_step_conf": 0.19845527410507202,
"num_tokens": 8924249.0,
"reward": 0.5710136890411377,
"reward_std": 0.06168290600180626,
"rewards/accuracy_reward_step": 0.734375,
"rewards/final_brier_reward_step": 0.7680135369300842,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.03026391752064228,
"step": 28
},
{
"adv/mean_abs_final_conf": 0.7686499357223511,
"adv/mean_abs_reasoning": 0.3789054751396179,
"adv/mean_abs_step_conf": 0.6669013500213623,
"adv/ratio_final_to_reasoning": 2.028606040699521,
"adv/ratio_step_to_reasoning": 1.7600731416605278,
"adv/std_final_conf": 0.9301630854606628,
"adv/std_reasoning": 0.6613036394119263,
"adv/std_step_conf": 0.8817446231842041,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 17.3125,
"calib/ece": 0.07533306772908363,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.0199203187250996,
"calib/gap": -0.01679394573531534,
"calib/mean_conf": 0.6844278884462152,
"calib/mu_c": 0.6789414201183432,
"calib/mu_w": 0.6957353658536586,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.04322709163346618,
"calib/std_conf": 0.122758852859182,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6094287381473377,
"calib/step_q_c_n": 2742.0,
"calib/step_q_gap": -0.00011741569881607017,
"calib/step_q_w": 0.6095461538461537,
"calib/step_q_w_n": 1690.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2853.0,
"completions/max_terminated_length": 2853.0,
"completions/mean_length": 988.06640625,
"completions/mean_terminated_length": 1003.7500610351562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 338.0,
"epoch": 0.030933333333333334,
"grad_norm": 20.593656539916992,
"kl": 0.20651626586914062,
"learning_rate": 4.777777777777778e-06,
"loss": -0.0883,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.016685109585523605,
"mask/share_reasoning": 0.7725826501846313,
"mask/share_step_conf": 0.19510728120803833,
"num_tokens": 9284322.0,
"reward": 0.5559663772583008,
"reward_std": 0.09881217032670975,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.7426596283912659,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.041148170828819275,
"step": 29
},
{
"adv/mean_abs_final_conf": 0.7729416489601135,
"adv/mean_abs_reasoning": 0.3953835964202881,
"adv/mean_abs_step_conf": 0.6607516407966614,
"adv/ratio_final_to_reasoning": 1.9549158234133863,
"adv/ratio_step_to_reasoning": 1.6711660442642395,
"adv/std_final_conf": 0.9272094368934631,
"adv/std_reasoning": 0.6613385677337646,
"adv/std_step_conf": 0.8670967817306519,
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 15.7734375,
"calib/ece": 0.06759196787148586,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.008032128514056224,
"calib/gap": -0.005014390021296,
"calib/mean_conf": 0.697310843373494,
"calib/mu_c": 0.6957803468208092,
"calib/mu_w": 0.7007947368421052,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03506184738955824,
"calib/std_conf": 0.08357829459976067,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6052986811481769,
"calib/step_q_c_n": 2578.0,
"calib/step_q_gap": -0.002377277755932772,
"calib/step_q_w": 0.6076759589041096,
"calib/step_q_w_n": 1460.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2843.0,
"completions/max_terminated_length": 2843.0,
"completions/mean_length": 943.890625,
"completions/mean_terminated_length": 962.6932373046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 388.0,
"epoch": 0.032,
"grad_norm": 23.295793533325195,
"kl": 0.13172149658203125,
"learning_rate": 4.75e-06,
"loss": -0.0768,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.016622882336378098,
"mask/share_reasoning": 0.765550434589386,
"mask/share_step_conf": 0.1982954442501068,
"num_tokens": 9632942.0,
"reward": 0.5647997260093689,
"reward_std": 0.10322060436010361,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7575246095657349,
"rewards/format_reward_step": 0.97265625,
"rewards/step_margin_reward": 0.04238741099834442,
"step": 30
},
{
"adv/mean_abs_final_conf": 0.7606886625289917,
"adv/mean_abs_reasoning": 0.40039771795272827,
"adv/mean_abs_step_conf": 0.6468129754066467,
"adv/ratio_final_to_reasoning": 1.8998326624299093,
"adv/ratio_step_to_reasoning": 1.6154262284856746,
"adv/std_final_conf": 0.9288567304611206,
"adv/std_reasoning": 0.6613115072250366,
"adv/std_step_conf": 0.8345862030982971,
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 16.53125,
"calib/ece": 0.0926899598393575,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.012048192771084338,
"calib/gap": 0.00555919241573033,
"calib/mean_conf": 0.6913261044176706,
"calib/mu_c": 0.6933131250000001,
"calib/mu_w": 0.6877539325842698,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0707228915662651,
"calib/std_conf": 0.10581848025908569,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6108364389233955,
"calib/step_q_c_n": 2415.0,
"calib/step_q_gap": -0.018585080063946213,
"calib/step_q_w": 0.6294215189873417,
"calib/step_q_w_n": 1817.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2523.0,
"completions/max_terminated_length": 2523.0,
"completions/mean_length": 919.84765625,
"completions/mean_terminated_length": 945.706787109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 364.0,
"epoch": 0.03306666666666667,
"grad_norm": 77.55889892578125,
"kl": 0.10741424560546875,
"learning_rate": 4.722222222222222e-06,
"loss": -0.0953,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.017139755189418793,
"mask/share_reasoning": 0.7630516290664673,
"mask/share_step_conf": 0.1924648880958557,
"num_tokens": 9974335.0,
"reward": 0.5456604957580566,
"reward_std": 0.09380966424942017,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.738542914390564,
"rewards/format_reward_step": 0.97265625,
"rewards/step_margin_reward": 0.033246852457523346,
"step": 31
},
{
"adv/mean_abs_final_conf": 0.7483454942703247,
"adv/mean_abs_reasoning": 0.3946504294872284,
"adv/mean_abs_step_conf": 0.591101884841919,
"adv/ratio_final_to_reasoning": 1.896223691540522,
"adv/ratio_step_to_reasoning": 1.497785991541783,
"adv/std_final_conf": 0.9297779202461243,
"adv/std_reasoning": 0.6815653443336487,
"adv/std_step_conf": 0.832304060459137,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 16.9609375,
"calib/ece": 0.15730119047619043,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.015873015873015872,
"calib/gap": -0.043936311514572335,
"calib/mean_conf": 0.6955559523809524,
"calib/mu_c": 0.6796900621118013,
"calib/mu_w": 0.7236263736263736,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10698412698412699,
"calib/std_conf": 0.07975371616637646,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6061186650185414,
"calib/step_q_c_n": 2427.0,
"calib/step_q_gap": -0.008288489028456025,
"calib/step_q_w": 0.6144071540469974,
"calib/step_q_w_n": 1915.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2522.0,
"completions/max_terminated_length": 2522.0,
"completions/mean_length": 909.0703125,
"completions/mean_terminated_length": 916.2283325195312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 391.0,
"epoch": 0.034133333333333335,
"grad_norm": 0.46288660168647766,
"kl": 0.03807830810546875,
"learning_rate": 4.694444444444445e-06,
"loss": -0.0434,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.01737382635474205,
"mask/share_reasoning": 0.7727394104003906,
"mask/share_step_conf": 0.20207428932189941,
"num_tokens": 10313761.0,
"reward": 0.5478274822235107,
"reward_std": 0.09904163330793381,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.7278913855552673,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.045107267796993256,
"step": 32
},
{
"adv/mean_abs_final_conf": 0.7612197995185852,
"adv/mean_abs_reasoning": 0.37524527311325073,
"adv/mean_abs_step_conf": 0.6346988677978516,
"adv/ratio_final_to_reasoning": 2.0285926407628474,
"adv/ratio_step_to_reasoning": 1.6914240185679743,
"adv/std_final_conf": 0.9312652945518494,
"adv/std_reasoning": 0.6612553000450134,
"adv/std_step_conf": 0.85466468334198,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 16.16015625,
"calib/ece": 0.10518458498023714,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.03162055335968379,
"calib/gap": 0.027722876498175952,
"calib/mean_conf": 0.693801185770751,
"calib/mu_c": 0.7048684210526316,
"calib/mu_w": 0.6771455445544556,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09909762845849801,
"calib/std_conf": 0.13993269485543056,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6107536945812807,
"calib/step_q_c_n": 2030.0,
"calib/step_q_gap": -0.03945717395217907,
"calib/step_q_w": 0.6502108685334598,
"calib/step_q_w_n": 2107.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2523.0,
"completions/max_terminated_length": 2523.0,
"completions/mean_length": 864.5703125,
"completions/mean_terminated_length": 871.3779296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 308.0,
"epoch": 0.0352,
"grad_norm": 0.9508217573165894,
"kl": 0.06719207763671875,
"learning_rate": 4.666666666666667e-06,
"loss": 0.0206,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.01880577951669693,
"mask/share_reasoning": 0.7738245129585266,
"mask/share_step_conf": 0.19955721497535706,
"num_tokens": 10641963.0,
"reward": 0.5473966002464294,
"reward_std": 0.10497646033763885,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7364916801452637,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.04189526289701462,
"step": 33
},
{
"adv/mean_abs_final_conf": 0.753960371017456,
"adv/mean_abs_reasoning": 0.4716504216194153,
"adv/mean_abs_step_conf": 0.6549442410469055,
"adv/ratio_final_to_reasoning": 1.5985576105894859,
"adv/ratio_step_to_reasoning": 1.3886221892861867,
"adv/std_final_conf": 0.9314026832580566,
"adv/std_reasoning": 0.7206087708473206,
"adv/std_step_conf": 0.8729121685028076,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 15.7265625,
"calib/ece": 0.15539529411764708,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.11764705882352941,
"calib/gap": 0.06722285351755264,
"calib/mean_conf": 0.7493890196078431,
"calib/mu_c": 0.7710057803468209,
"calib/mu_w": 0.7037829268292682,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1131764705882353,
"calib/std_conf": 0.23408845495277225,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6559398144712429,
"calib/step_q_c_n": 2695.0,
"calib/step_q_gap": 0.018178131526088936,
"calib/step_q_w": 0.637761682945154,
"calib/step_q_w_n": 1331.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2105.0,
"completions/max_terminated_length": 2105.0,
"completions/mean_length": 820.75,
"completions/mean_terminated_length": 827.2125854492188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 389.0,
"epoch": 0.03626666666666667,
"grad_norm": 1.8004989624023438,
"kl": 0.1200408935546875,
"learning_rate": 4.638888888888889e-06,
"loss": 0.0104,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.018822304904460907,
"mask/share_reasoning": 0.7638715505599976,
"mask/share_step_conf": 0.20949357748031616,
"num_tokens": 10957187.0,
"reward": 0.5783343315124512,
"reward_std": 0.14782200753688812,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.748401403427124,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.07389220595359802,
"step": 34
},
{
"adv/mean_abs_final_conf": 0.7771700620651245,
"adv/mean_abs_reasoning": 0.39579248428344727,
"adv/mean_abs_step_conf": 0.5836151838302612,
"adv/ratio_final_to_reasoning": 1.9635796356066055,
"adv/ratio_step_to_reasoning": 1.4745484237449655,
"adv/std_final_conf": 0.9326081275939941,
"adv/std_reasoning": 0.6815245747566223,
"adv/std_step_conf": 0.8389482498168945,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 17.5078125,
"calib/ece": 0.27989444444444433,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.1984126984126984,
"calib/gap": -0.0006252383850678944,
"calib/mean_conf": 0.7447087301587302,
"calib/mu_c": 0.7444779874213836,
"calib/mu_w": 0.7451032258064515,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19682539682539676,
"calib/std_conf": 0.26671001448866943,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.657295349777598,
"calib/step_q_c_n": 2473.0,
"calib/step_q_gap": -0.004080707962571206,
"calib/step_q_w": 0.6613760577401692,
"calib/step_q_w_n": 2009.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2301.0,
"completions/max_terminated_length": 2301.0,
"completions/mean_length": 963.65234375,
"completions/mean_terminated_length": 975.0791015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 325.0,
"epoch": 0.037333333333333336,
"grad_norm": 2.1660313606262207,
"kl": 0.1023101806640625,
"learning_rate": 4.611111111111112e-06,
"loss": -0.0413,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.016110636293888092,
"mask/share_reasoning": 0.7811132669448853,
"mask/share_step_conf": 0.19105733931064606,
"num_tokens": 11313138.0,
"reward": 0.5301204919815063,
"reward_std": 0.15779337286949158,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.6721141338348389,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.06703317910432816,
"step": 35
},
{
"adv/mean_abs_final_conf": 0.7380247116088867,
"adv/mean_abs_reasoning": 0.2726476490497589,
"adv/mean_abs_step_conf": 0.647517740726471,
"adv/ratio_final_to_reasoning": 2.706880892540523,
"adv/ratio_step_to_reasoning": 2.3749250836499867,
"adv/std_final_conf": 0.9322901964187622,
"adv/std_reasoning": 0.5728190541267395,
"adv/std_step_conf": 0.8712092041969299,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 17.12109375,
"calib/ece": 0.13891155378486053,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.1593625498007968,
"calib/gap": 0.11752367346938775,
"calib/mean_conf": 0.7103314741035857,
"calib/mu_c": 0.7360836734693877,
"calib/mu_w": 0.61856,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.034183266932270896,
"calib/std_conf": 0.2845890761017935,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6628022576361222,
"calib/step_q_c_n": 3012.0,
"calib/step_q_gap": 0.016343906067923797,
"calib/step_q_w": 0.6464583515681984,
"calib/step_q_w_n": 1371.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2780.0,
"completions/max_terminated_length": 2780.0,
"completions/mean_length": 851.8046875,
"completions/mean_terminated_length": 872.248046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 384.0,
"epoch": 0.0384,
"grad_norm": 1.1923136711120605,
"kl": 0.125030517578125,
"learning_rate": 4.583333333333333e-06,
"loss": -0.0812,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.018202677369117737,
"mask/share_reasoning": 0.7517971396446228,
"mask/share_step_conf": 0.20656266808509827,
"num_tokens": 11633912.0,
"reward": 0.5910356044769287,
"reward_std": 0.1539851725101471,
"rewards/accuracy_reward_step": 0.765625,
"rewards/final_brier_reward_step": 0.7678468823432922,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.0650056004524231,
"step": 36
},
{
"adv/mean_abs_final_conf": 0.7172503471374512,
"adv/mean_abs_reasoning": 0.4631299674510956,
"adv/mean_abs_step_conf": 0.5714775323867798,
"adv/ratio_final_to_reasoning": 1.5487020869863912,
"adv/ratio_step_to_reasoning": 1.2339463488661533,
"adv/std_final_conf": 0.9181895852088928,
"adv/std_reasoning": 0.7393516302108765,
"adv/std_step_conf": 0.8393474817276001,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 16.50390625,
"calib/ece": 0.20361517857142847,
"calib/final_conf_rate": 0.875,
"calib/format_rate": 0.875,
"calib/frac_conf_gt_0.9": 0.17410714285714285,
"calib/gap": 0.03333103686437,
"calib/mean_conf": 0.7635276785714286,
"calib/mu_c": 0.7755804195804195,
"calib/mu_w": 0.7422493827160495,
"calib/nonempty_final_conf_rate": 0.875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1643749999999999,
"calib/std_conf": 0.24160783129818242,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6688296014330497,
"calib/step_q_c_n": 2233.0,
"calib/step_q_gap": 0.03221454119208578,
"calib/step_q_w": 0.6366150602409639,
"calib/step_q_w_n": 1992.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2686.0,
"completions/max_terminated_length": 2686.0,
"completions/mean_length": 849.45703125,
"completions/mean_terminated_length": 862.9405517578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 335.0,
"epoch": 0.039466666666666664,
"grad_norm": 3.8515422344207764,
"kl": 0.1292877197265625,
"learning_rate": 4.555555555555556e-06,
"loss": -0.1164,
"mask/has_final_conf_rate": 0.875,
"mask/share_final_conf": 0.017217468470335007,
"mask/share_reasoning": 0.7670263648033142,
"mask/share_step_conf": 0.2001311331987381,
"num_tokens": 11958469.0,
"reward": 0.4886449873447418,
"reward_std": 0.14419779181480408,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6216946840286255,
"rewards/format_reward_step": 0.875,
"rewards/step_margin_reward": 0.06106396019458771,
"step": 37
},
{
"adv/mean_abs_final_conf": 0.8081904649734497,
"adv/mean_abs_reasoning": 0.7108105421066284,
"adv/mean_abs_step_conf": 0.5965287089347839,
"adv/ratio_final_to_reasoning": 1.136998422361909,
"adv/ratio_step_to_reasoning": 0.8392232157486754,
"adv/std_final_conf": 0.9355867505073547,
"adv/std_reasoning": 0.8904671669006348,
"adv/std_step_conf": 0.8587121367454529,
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 20.08984375,
"calib/ece": 0.3537238888888889,
"calib/final_conf_rate": 0.703125,
"calib/format_rate": 0.703125,
"calib/frac_conf_gt_0.9": 0.32222222222222224,
"calib/gap": 0.1414482142857143,
"calib/mean_conf": 0.5383872222222224,
"calib/mu_c": 0.5918232142857143,
"calib/mu_w": 0.45037499999999997,
"calib/nonempty_final_conf_rate": 0.703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13494444444444448,
"calib/std_conf": 0.4416079143350978,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6499626506024097,
"calib/step_q_c_n": 2573.0,
"calib/step_q_gap": 0.15248167783976374,
"calib/step_q_w": 0.4974809727626459,
"calib/step_q_w_n": 2570.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 3009.0,
"completions/max_terminated_length": 3009.0,
"completions/mean_length": 922.26953125,
"completions/mean_terminated_length": 952.0201416015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 377.0,
"epoch": 0.04053333333333333,
"grad_norm": 13.913761138916016,
"kl": 0.1562652587890625,
"learning_rate": 4.527777777777778e-06,
"loss": -0.2601,
"mask/has_final_conf_rate": 0.703125,
"mask/share_final_conf": 0.012419766746461391,
"mask/share_reasoning": 0.7490566968917847,
"mask/share_step_conf": 0.20727355778217316,
"num_tokens": 12301458.0,
"reward": 0.3639047145843506,
"reward_std": 0.27264875173568726,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.4425402283668518,
"rewards/format_reward_step": 0.703125,
"rewards/step_margin_reward": 0.023550385609269142,
"step": 38
},
{
"adv/mean_abs_final_conf": 0.7847657799720764,
"adv/mean_abs_reasoning": 0.34654319286346436,
"adv/mean_abs_step_conf": 0.6521954536437988,
"adv/ratio_final_to_reasoning": 2.264554018469117,
"adv/ratio_step_to_reasoning": 1.8820033608357714,
"adv/std_final_conf": 0.9348078966140747,
"adv/std_reasoning": 0.640278697013855,
"adv/std_step_conf": 0.8904418349266052,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 16.54296875,
"calib/ece": 0.4314366533864542,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.3147410358565737,
"calib/gap": 0.1220060851648353,
"calib/mean_conf": 0.39740796812749,
"calib/mu_c": 0.4416412500000001,
"calib/mu_w": 0.3196351648351648,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09569721115537849,
"calib/std_conf": 0.45804881324482766,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6229245873889124,
"calib/step_q_c_n": 2363.0,
"calib/step_q_gap": 0.10884029251711758,
"calib/step_q_w": 0.5140842948717949,
"calib/step_q_w_n": 1872.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 3018.0,
"completions/max_terminated_length": 3018.0,
"completions/mean_length": 879.80078125,
"completions/mean_terminated_length": 890.2332153320312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 363.0,
"epoch": 0.0416,
"grad_norm": 2.39874267578125,
"kl": 0.152252197265625,
"learning_rate": 4.5e-06,
"loss": -0.0102,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.018277311697602272,
"mask/share_reasoning": 0.7705078125,
"mask/share_step_conf": 0.19949612021446228,
"num_tokens": 12632775.0,
"reward": 0.42221030592918396,
"reward_std": 0.2368159294128418,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.5469609498977661,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": -0.023634128272533417,
"step": 39
},
{
"adv/mean_abs_final_conf": 0.7526578307151794,
"adv/mean_abs_reasoning": 0.359994113445282,
"adv/mean_abs_step_conf": 0.6284192800521851,
"adv/ratio_final_to_reasoning": 2.0907503834214256,
"adv/ratio_step_to_reasoning": 1.7456376551215549,
"adv/std_final_conf": 0.9165927171707153,
"adv/std_reasoning": 0.6402466893196106,
"adv/std_step_conf": 0.8590297102928162,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 18.49609375,
"calib/ece": 0.6018401606425703,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.20080321285140562,
"calib/gap": -0.12125155160628848,
"calib/mean_conf": 0.23783855421686748,
"calib/mu_c": 0.19157792207792207,
"calib/mu_w": 0.31282947368421055,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11060240963855418,
"calib/std_conf": 0.39857486560681676,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5107157802042973,
"calib/step_q_c_n": 2839.0,
"calib/step_q_gap": 0.027086455309782542,
"calib/step_q_w": 0.48362932489451477,
"calib/step_q_w_n": 1896.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2966.0,
"completions/max_terminated_length": 2966.0,
"completions/mean_length": 946.015625,
"completions/mean_terminated_length": 957.2332153320312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 345.0,
"epoch": 0.042666666666666665,
"grad_norm": 1.3916919231414795,
"kl": 0.1547088623046875,
"learning_rate": 4.472222222222223e-06,
"loss": -0.0372,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.016756486147642136,
"mask/share_reasoning": 0.7632251977920532,
"mask/share_step_conf": 0.20829957723617554,
"num_tokens": 12981715.0,
"reward": 0.31116634607315063,
"reward_std": 0.21380087733268738,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.3920474648475647,
"rewards/format_reward_step": 0.97265625,
"rewards/step_margin_reward": -0.08846484124660492,
"step": 40
},
{
"adv/mean_abs_final_conf": 0.7882683277130127,
"adv/mean_abs_reasoning": 0.29561924934387207,
"adv/mean_abs_step_conf": 0.6299017667770386,
"adv/ratio_final_to_reasoning": 2.6664986446673447,
"adv/ratio_step_to_reasoning": 2.130787383349047,
"adv/std_final_conf": 0.930815577507019,
"adv/std_reasoning": 0.5726768374443054,
"adv/std_step_conf": 0.8430304527282715,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 16.1875,
"calib/ece": 0.4181775590551181,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.3228346456692913,
"calib/gap": 0.17358806348506556,
"calib/mean_conf": 0.4596964566929133,
"calib/mu_c": 0.4890834123222748,
"calib/mu_w": 0.31549534883720926,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.02358267716535427,
"calib/std_conf": 0.45717941819434066,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6162898284313725,
"calib/step_q_c_n": 3264.0,
"calib/step_q_gap": 0.06190641934046337,
"calib/step_q_w": 0.5543834090909091,
"calib/step_q_w_n": 880.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1843.0,
"completions/max_terminated_length": 1843.0,
"completions/mean_length": 831.1953125,
"completions/mean_terminated_length": 837.7401733398438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 386.0,
"epoch": 0.04373333333333333,
"grad_norm": 4.049104690551758,
"kl": 0.1920166015625,
"learning_rate": 4.444444444444444e-06,
"loss": 0.0009,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.018943600356578827,
"mask/share_reasoning": 0.7585512399673462,
"mask/share_step_conf": 0.2146926373243332,
"num_tokens": 13301749.0,
"reward": 0.45164954662323,
"reward_std": 0.23738747835159302,
"rewards/accuracy_reward_step": 0.828125,
"rewards/final_brier_reward_step": 0.5571421384811401,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": -0.017905592918395996,
"step": 41
},
{
"adv/mean_abs_final_conf": 0.7633222937583923,
"adv/mean_abs_reasoning": 0.2732507884502411,
"adv/mean_abs_step_conf": 0.5937932729721069,
"adv/ratio_final_to_reasoning": 2.7934861527303267,
"adv/ratio_step_to_reasoning": 2.1730706664739836,
"adv/std_final_conf": 0.9305040836334229,
"adv/std_reasoning": 0.5726051926612854,
"adv/std_step_conf": 0.840979814529419,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 15.49609375,
"calib/ece": 0.3167365079365079,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.4007936507936508,
"calib/gap": 0.004208311229001005,
"calib/mean_conf": 0.6858031746031745,
"calib/mu_c": 0.6871057471264368,
"calib/mu_w": 0.6828974358974358,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15603174603174602,
"calib/std_conf": 0.375200081318845,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6613030564545127,
"calib/step_q_c_n": 2781.0,
"calib/step_q_gap": 0.02169175797221934,
"calib/step_q_w": 0.6396112984822934,
"calib/step_q_w_n": 1186.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2203.0,
"completions/max_terminated_length": 2203.0,
"completions/mean_length": 765.09765625,
"completions/mean_terminated_length": 777.2421264648438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 406.0,
"epoch": 0.0448,
"grad_norm": 249.845458984375,
"kl": 8.14959716796875,
"learning_rate": 4.416666666666667e-06,
"loss": 0.0797,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.0191387627273798,
"mask/share_reasoning": 0.7546572089195251,
"mask/share_step_conf": 0.2105790227651596,
"num_tokens": 13601982.0,
"reward": 0.5088764429092407,
"reward_std": 0.18659836053848267,
"rewards/accuracy_reward_step": 0.69140625,
"rewards/final_brier_reward_step": 0.6371692419052124,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.04542739316821098,
"step": 42
},
{
"adv/mean_abs_final_conf": 0.7796351909637451,
"adv/mean_abs_reasoning": 0.3929508924484253,
"adv/mean_abs_step_conf": 0.6887763142585754,
"adv/ratio_final_to_reasoning": 1.9840524756310918,
"adv/ratio_step_to_reasoning": 1.7528305126549042,
"adv/std_final_conf": 0.9343047142028809,
"adv/std_reasoning": 0.6612555384635925,
"adv/std_step_conf": 0.9042664766311646,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 17.1484375,
"calib/ece": 0.25004566929133853,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.2677165354330709,
"calib/gap": 0.02996012250161173,
"calib/mean_conf": 0.6705055118110237,
"calib/mu_c": 0.6782904255319149,
"calib/mu_w": 0.6483303030303031,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09019685039370078,
"calib/std_conf": 0.34314449547205267,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6246479186834463,
"calib/step_q_c_n": 3099.0,
"calib/step_q_gap": 0.030246601874770795,
"calib/step_q_w": 0.5944013168086755,
"calib/step_q_w_n": 1291.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2403.0,
"completions/max_terminated_length": 2403.0,
"completions/mean_length": 878.5,
"completions/mean_terminated_length": 885.4172973632812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 363.0,
"epoch": 0.04586666666666667,
"grad_norm": 1.552886724472046,
"kl": 0.198394775390625,
"learning_rate": 4.388888888888889e-06,
"loss": -0.0167,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.01802043803036213,
"mask/share_reasoning": 0.7663673162460327,
"mask/share_step_conf": 0.2077997326850891,
"num_tokens": 13932102.0,
"reward": 0.5526186227798462,
"reward_std": 0.169440358877182,
"rewards/accuracy_reward_step": 0.734375,
"rewards/final_brier_reward_step": 0.6911579370498657,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.06876671314239502,
"step": 43
},
{
"adv/mean_abs_final_conf": 0.7641512155532837,
"adv/mean_abs_reasoning": 0.4402768015861511,
"adv/mean_abs_step_conf": 0.6271167993545532,
"adv/ratio_final_to_reasoning": 1.7356154419227525,
"adv/ratio_step_to_reasoning": 1.4243693901093315,
"adv/std_final_conf": 0.9358059167861938,
"adv/std_reasoning": 0.7206077575683594,
"adv/std_step_conf": 0.8419582843780518,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 18.640625,
"calib/ece": 0.18296857142857148,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.19591836734693877,
"calib/gap": 0.03705228070175448,
"calib/mean_conf": 0.7537661224489797,
"calib/mu_c": 0.7681333333333333,
"calib/mu_w": 0.7310810526315789,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16224489795918373,
"calib/std_conf": 0.1857926899203027,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.600721026802567,
"calib/step_q_c_n": 2649.0,
"calib/step_q_gap": -0.003705916202614401,
"calib/step_q_w": 0.6044269430051814,
"calib/step_q_w_n": 2123.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2293.0,
"completions/max_terminated_length": 2293.0,
"completions/mean_length": 859.12890625,
"completions/mean_terminated_length": 894.0527954101562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 292.0,
"epoch": 0.046933333333333334,
"grad_norm": 1.1150966882705688,
"kl": 0.16571044921875,
"learning_rate": 4.361111111111112e-06,
"loss": -0.1341,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.016945868730545044,
"mask/share_reasoning": 0.7451803684234619,
"mask/share_step_conf": 0.19881124794483185,
"num_tokens": 14258359.0,
"reward": 0.5474046468734741,
"reward_std": 0.16996341943740845,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6944642066955566,
"rewards/format_reward_step": 0.95703125,
"rewards/step_margin_reward": 0.09018873423337936,
"step": 44
},
{
"adv/mean_abs_final_conf": 0.756545901298523,
"adv/mean_abs_reasoning": 0.42394763231277466,
"adv/mean_abs_step_conf": 0.7025113105773926,
"adv/ratio_final_to_reasoning": 1.78452677556262,
"adv/ratio_step_to_reasoning": 1.6570709612056584,
"adv/std_final_conf": 0.935492992401123,
"adv/std_reasoning": 0.7015060782432556,
"adv/std_step_conf": 0.9042750000953674,
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 18.265625,
"calib/ece": 0.11956734693877547,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.11020408163265306,
"calib/gap": 0.00505797101449279,
"calib/mean_conf": 0.7135755102040817,
"calib/mu_c": 0.715,
"calib/mu_w": 0.7099420289855072,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05738775510204078,
"calib/std_conf": 0.17422795123655282,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5407490272373541,
"calib/step_q_c_n": 3084.0,
"calib/step_q_gap": -0.02061278180787207,
"calib/step_q_w": 0.5613618090452261,
"calib/step_q_w_n": 1592.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 3049.0,
"completions/max_terminated_length": 3049.0,
"completions/mean_length": 859.15625,
"completions/mean_terminated_length": 890.4615478515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 273.0,
"epoch": 0.048,
"grad_norm": 7.584715366363525,
"kl": 0.45855712890625,
"learning_rate": 4.333333333333334e-06,
"loss": -0.0599,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.01749027520418167,
"mask/share_reasoning": 0.7472882866859436,
"mask/share_step_conf": 0.20006519556045532,
"num_tokens": 14583351.0,
"reward": 0.5705052018165588,
"reward_std": 0.15893591940402985,
"rewards/accuracy_reward_step": 0.69921875,
"rewards/final_brier_reward_step": 0.7362944483757019,
"rewards/format_reward_step": 0.95703125,
"rewards/step_margin_reward": 0.07346594333648682,
"step": 45
},
{
"adv/mean_abs_final_conf": 0.7510257959365845,
"adv/mean_abs_reasoning": 0.5484082102775574,
"adv/mean_abs_step_conf": 0.7050764560699463,
"adv/ratio_final_to_reasoning": 1.369464901986933,
"adv/ratio_step_to_reasoning": 1.2856781551703917,
"adv/std_final_conf": 0.9359838366508484,
"adv/std_reasoning": 0.7928950190544128,
"adv/std_step_conf": 0.9049521088600159,
"calib/answer_extract_rate": 0.9453125,
"calib/avg_num_step_conf": 18.79296875,
"calib/ece": 0.12113692946058094,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.11618257261410789,
"calib/gap": 0.01165860576205413,
"calib/mean_conf": 0.7049211618257263,
"calib/mu_c": 0.7091298701298702,
"calib/mu_w": 0.697471264367816,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09352697095435689,
"calib/std_conf": 0.1649222060048462,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5419610073111292,
"calib/step_q_c_n": 2462.0,
"calib/step_q_gap": -0.0590053613563889,
"calib/step_q_w": 0.6009663686675181,
"calib/step_q_w_n": 2349.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 2999.0,
"completions/max_terminated_length": 2999.0,
"completions/mean_length": 825.8671875,
"completions/mean_terminated_length": 873.6445922851562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 290.0,
"epoch": 0.04906666666666667,
"grad_norm": 1.147646188735962,
"kl": 0.178741455078125,
"learning_rate": 4.305555555555556e-06,
"loss": -0.1523,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.018486380577087402,
"mask/share_reasoning": 0.7309393882751465,
"mask/share_step_conf": 0.19588670134544373,
"num_tokens": 14899541.0,
"reward": 0.5333684086799622,
"reward_std": 0.16860918700695038,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6996122598648071,
"rewards/format_reward_step": 0.94140625,
"rewards/step_margin_reward": 0.058530814945697784,
"step": 46
},
{
"adv/mean_abs_final_conf": 0.7626035809516907,
"adv/mean_abs_reasoning": 0.2406388521194458,
"adv/mean_abs_step_conf": 0.6646240949630737,
"adv/ratio_final_to_reasoning": 3.1690791999504615,
"adv/ratio_step_to_reasoning": 2.761915165025698,
"adv/std_final_conf": 0.9349892735481262,
"adv/std_reasoning": 0.5227157473564148,
"adv/std_step_conf": 0.8726968169212341,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 17.01171875,
"calib/ece": 0.12545816733067722,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.11553784860557768,
"calib/gap": 0.0007207207207207134,
"calib/mean_conf": 0.7364541832669322,
"calib/mu_c": 0.7366666666666666,
"calib/mu_w": 0.7359459459459459,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07836653386454182,
"calib/std_conf": 0.1407088517775483,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.520901875901876,
"calib/step_q_c_n": 2772.0,
"calib/step_q_gap": -0.04521941279048036,
"calib/step_q_w": 0.5661212886923563,
"calib/step_q_w_n": 1583.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1872.0,
"completions/max_terminated_length": 1872.0,
"completions/mean_length": 861.109375,
"completions/mean_terminated_length": 871.3201904296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 399.0,
"epoch": 0.050133333333333335,
"grad_norm": 2.5522289276123047,
"kl": 0.23779296875,
"learning_rate": 4.277777777777778e-06,
"loss": -0.0587,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.017462948337197304,
"mask/share_reasoning": 0.7714071273803711,
"mask/share_step_conf": 0.19941113889217377,
"num_tokens": 15225961.0,
"reward": 0.59224933385849,
"reward_std": 0.1048864796757698,
"rewards/accuracy_reward_step": 0.69921875,
"rewards/final_brier_reward_step": 0.7565503716468811,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.09201083332300186,
"step": 47
},
{
"adv/mean_abs_final_conf": 0.7653645277023315,
"adv/mean_abs_reasoning": 0.5050486922264099,
"adv/mean_abs_step_conf": 0.6120218634605408,
"adv/ratio_final_to_reasoning": 1.5154272043124584,
"adv/ratio_step_to_reasoning": 1.2118076393041635,
"adv/std_final_conf": 0.9359065890312195,
"adv/std_reasoning": 0.7753804922103882,
"adv/std_step_conf": 0.8563839197158813,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 16.02734375,
"calib/ece": 0.2003212851405623,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.08835341365461848,
"calib/gap": -0.043063204451419645,
"calib/mean_conf": 0.6985542168674699,
"calib/mu_c": 0.683680981595092,
"calib/mu_w": 0.7267441860465117,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.12212851405622488,
"calib/std_conf": 0.14105568011199188,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5007367475292004,
"calib/step_q_c_n": 2226.0,
"calib/step_q_gap": -0.08999846824064506,
"calib/step_q_w": 0.5907352157698454,
"calib/step_q_w_n": 1877.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2747.0,
"completions/max_terminated_length": 2747.0,
"completions/mean_length": 774.2265625,
"completions/mean_terminated_length": 789.6494140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 270.0,
"epoch": 0.0512,
"grad_norm": 1.180692434310913,
"kl": 0.198089599609375,
"learning_rate": 4.25e-06,
"loss": -0.0692,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.02045733667910099,
"mask/share_reasoning": 0.7637689113616943,
"mask/share_step_conf": 0.19624248147010803,
"num_tokens": 15527851.0,
"reward": 0.5507139563560486,
"reward_std": 0.15430349111557007,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.7125750184059143,
"rewards/format_reward_step": 0.97265625,
"rewards/step_margin_reward": 0.066977858543396,
"step": 48
},
{
"adv/mean_abs_final_conf": 0.7673805356025696,
"adv/mean_abs_reasoning": 0.40805861353874207,
"adv/mean_abs_step_conf": 0.6758687496185303,
"adv/ratio_final_to_reasoning": 1.8805644829985009,
"adv/ratio_step_to_reasoning": 1.6563031074317995,
"adv/std_final_conf": 0.9355793595314026,
"adv/std_reasoning": 0.6613188982009888,
"adv/std_step_conf": 0.8738494515419006,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 15.1015625,
"calib/ece": 0.11163968253968251,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.05952380952380952,
"calib/gap": 0.008185054945054948,
"calib/mean_conf": 0.6840999999999999,
"calib/mu_c": 0.6863736263736264,
"calib/mu_w": 0.6781885714285715,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03675873015873016,
"calib/std_conf": 0.14843343492345182,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5284627035213935,
"calib/step_q_c_n": 2641.0,
"calib/step_q_gap": 0.004620744337719973,
"calib/step_q_w": 0.5238419591836735,
"calib/step_q_w_n": 1225.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2889.0,
"completions/max_terminated_length": 2889.0,
"completions/mean_length": 781.9296875,
"completions/mean_terminated_length": 791.2015991210938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 359.0,
"epoch": 0.05226666666666667,
"grad_norm": 1.0000989437103271,
"kl": 0.19378662109375,
"learning_rate": 4.222222222222223e-06,
"loss": -0.0547,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.019703930243849754,
"mask/share_reasoning": 0.7719682455062866,
"mask/share_step_conf": 0.1966090351343155,
"num_tokens": 15832561.0,
"reward": 0.5932475924491882,
"reward_std": 0.12978488206863403,
"rewards/accuracy_reward_step": 0.71484375,
"rewards/final_brier_reward_step": 0.7670062780380249,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.07964511215686798,
"step": 49
},
{
"adv/mean_abs_final_conf": 0.7690557837486267,
"adv/mean_abs_reasoning": 0.3387686312198639,
"adv/mean_abs_step_conf": 0.6234382390975952,
"adv/ratio_final_to_reasoning": 2.270150518303162,
"adv/ratio_step_to_reasoning": 1.8403068691828737,
"adv/std_final_conf": 0.9357212781906128,
"adv/std_reasoning": 0.6402995586395264,
"adv/std_step_conf": 0.8528716564178467,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 14.671875,
"calib/ece": 0.16543999999999998,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.08,
"calib/gap": -0.022112266112266132,
"calib/mean_conf": 0.6725599999999999,
"calib/mu_c": 0.6668108108108107,
"calib/mu_w": 0.6889230769230769,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.04900000000000003,
"calib/std_conf": 0.14565935054091103,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5091788743253663,
"calib/step_q_c_n": 2594.0,
"calib/step_q_gap": -0.013506151492189589,
"calib/step_q_w": 0.5226850258175559,
"calib/step_q_w_n": 1162.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1994.0,
"completions/max_terminated_length": 1994.0,
"completions/mean_length": 748.32421875,
"completions/mean_terminated_length": 754.216552734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 194.0,
"epoch": 0.05333333333333334,
"grad_norm": 0.9171644449234009,
"kl": 0.2061767578125,
"learning_rate": 4.194444444444445e-06,
"loss": -0.0888,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.02141934633255005,
"mask/share_reasoning": 0.7716706991195679,
"mask/share_step_conf": 0.19909745454788208,
"num_tokens": 16129492.0,
"reward": 0.5873520970344543,
"reward_std": 0.12604889273643494,
"rewards/accuracy_reward_step": 0.7265625,
"rewards/final_brier_reward_step": 0.7552015781402588,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.07887758314609528,
"step": 50
},
{
"adv/mean_abs_final_conf": 0.758726954460144,
"adv/mean_abs_reasoning": 0.36322999000549316,
"adv/mean_abs_step_conf": 0.6598608493804932,
"adv/ratio_final_to_reasoning": 2.0888334535611164,
"adv/ratio_step_to_reasoning": 1.8166474892960078,
"adv/std_final_conf": 0.9354365468025208,
"adv/std_reasoning": 0.6610972285270691,
"adv/std_step_conf": 0.8887801766395569,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 14.765625,
"calib/ece": 0.1910546875,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.11328125,
"calib/gap": -0.06335106382978706,
"calib/mean_conf": 0.7034765625,
"calib/mu_c": 0.6866489361702128,
"calib/mu_w": 0.7499999999999999,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08007812499999997,
"calib/std_conf": 0.14585424364132704,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5035037065938354,
"calib/step_q_c_n": 2563.0,
"calib/step_q_gap": -0.04075266152448831,
"calib/step_q_w": 0.5442563681183237,
"calib/step_q_w_n": 1217.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2102.0,
"completions/max_terminated_length": 2102.0,
"completions/mean_length": 803.07421875,
"completions/mean_terminated_length": 809.3976440429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 257.0,
"epoch": 0.0544,
"grad_norm": 1.2030285596847534,
"kl": 0.195037841796875,
"learning_rate": 4.166666666666667e-06,
"loss": 0.0091,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.020157014951109886,
"mask/share_reasoning": 0.7813325524330139,
"mask/share_step_conf": 0.19069793820381165,
"num_tokens": 16444375.0,
"reward": 0.6015352606773376,
"reward_std": 0.12200477719306946,
"rewards/accuracy_reward_step": 0.734375,
"rewards/final_brier_reward_step": 0.7579878568649292,
"rewards/format_reward_step": 1.0,
"rewards/step_margin_reward": 0.09820760786533356,
"step": 51
},
{
"adv/mean_abs_final_conf": 0.7669674158096313,
"adv/mean_abs_reasoning": 0.18372583389282227,
"adv/mean_abs_step_conf": 0.705299973487854,
"adv/ratio_final_to_reasoning": 4.174521348244619,
"adv/ratio_step_to_reasoning": 3.8388720766361883,
"adv/std_final_conf": 0.9351452589035034,
"adv/std_reasoning": 0.4374311864376068,
"adv/std_step_conf": 0.903425395488739,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 13.734375,
"calib/ece": 0.16539062499999999,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.09765625,
"calib/gap": 0.0019213250517599034,
"calib/mean_conf": 0.67875,
"calib/mu_c": 0.6790952380952382,
"calib/mu_w": 0.6771739130434783,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.011914062499999978,
"calib/std_conf": 0.1553499074669824,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5023640167364016,
"calib/step_q_c_n": 2868.0,
"calib/step_q_gap": 0.03392265871171024,
"calib/step_q_w": 0.4684413580246914,
"calib/step_q_w_n": 648.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1607.0,
"completions/max_terminated_length": 1607.0,
"completions/mean_length": 760.84375,
"completions/mean_terminated_length": 766.8346557617188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 289.0,
"epoch": 0.055466666666666664,
"grad_norm": 0.9788868427276611,
"kl": 0.2137451171875,
"learning_rate": 4.138888888888889e-06,
"loss": -0.0505,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.021420970559120178,
"mask/share_reasoning": 0.7817922234535217,
"mask/share_step_conf": 0.1889742910861969,
"num_tokens": 16747103.0,
"reward": 0.6501138806343079,
"reward_std": 0.08566803485155106,
"rewards/accuracy_reward_step": 0.8203125,
"rewards/final_brier_reward_step": 0.8089929819107056,
"rewards/format_reward_step": 1.0,
"rewards/step_margin_reward": 0.12717223167419434,
"step": 52
},
{
"adv/mean_abs_final_conf": 0.7650182247161865,
"adv/mean_abs_reasoning": 0.36879515647888184,
"adv/mean_abs_step_conf": 0.6698992252349854,
"adv/ratio_final_to_reasoning": 2.0743716702255375,
"adv/ratio_step_to_reasoning": 1.816453425340323,
"adv/std_final_conf": 0.9354962706565857,
"adv/std_reasoning": 0.6402270197868347,
"adv/std_step_conf": 0.8737857341766357,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 13.5703125,
"calib/ece": 0.20097656250000007,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.08203125,
"calib/gap": -0.05537704918032793,
"calib/mean_conf": 0.7151953124999999,
"calib/mu_c": 0.702,
"calib/mu_w": 0.7573770491803279,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07722656250000005,
"calib/std_conf": 0.14481656277176083,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5216705336426914,
"calib/step_q_c_n": 2586.0,
"calib/step_q_gap": -0.020367754645596814,
"calib/step_q_w": 0.5420382882882883,
"calib/step_q_w_n": 888.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1754.0,
"completions/max_terminated_length": 1754.0,
"completions/mean_length": 784.01953125,
"completions/mean_terminated_length": 790.1929321289062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 279.0,
"epoch": 0.05653333333333333,
"grad_norm": 0.5744272470474243,
"kl": 0.21148681640625,
"learning_rate": 4.111111111111111e-06,
"loss": -0.0426,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.019791677594184875,
"mask/share_reasoning": 0.7987407445907593,
"mask/share_step_conf": 0.17365510761737823,
"num_tokens": 17053636.0,
"reward": 0.6078897714614868,
"reward_std": 0.12828344106674194,
"rewards/accuracy_reward_step": 0.76171875,
"rewards/final_brier_reward_step": 0.7752581834793091,
"rewards/format_reward_step": 1.0,
"rewards/step_margin_reward": 0.08817760646343231,
"step": 53
},
{
"adv/mean_abs_final_conf": 0.7916830778121948,
"adv/mean_abs_reasoning": 0.2069225311279297,
"adv/mean_abs_step_conf": 0.5848709344863892,
"adv/ratio_final_to_reasoning": 3.8259877911639184,
"adv/ratio_step_to_reasoning": 2.826521265220718,
"adv/std_final_conf": 0.9353046417236328,
"adv/std_reasoning": 0.4676089584827423,
"adv/std_step_conf": 0.8257472515106201,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 11.6328125,
"calib/ece": 0.17996093750000003,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.07421875,
"calib/gap": 0.02836110082685428,
"calib/mean_conf": 0.6812890625000001,
"calib/mu_c": 0.6853881278538814,
"calib/mu_w": 0.6570270270270271,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.002890624999999994,
"calib/std_conf": 0.15071022756558725,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.49881886642885453,
"calib/step_q_c_n": 2523.0,
"calib/step_q_gap": 0.010269415879403943,
"calib/step_q_w": 0.4885494505494506,
"calib/step_q_w_n": 455.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2014.0,
"completions/max_terminated_length": 2014.0,
"completions/mean_length": 663.92578125,
"completions/mean_terminated_length": 669.153564453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 233.0,
"epoch": 0.0576,
"grad_norm": 2.6270413398742676,
"kl": 0.2523193359375,
"learning_rate": 4.083333333333334e-06,
"loss": 0.0061,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.025035999715328217,
"mask/share_reasoning": 0.7858097553253174,
"mask/share_step_conf": 0.1813417375087738,
"num_tokens": 17329833.0,
"reward": 0.6468451023101807,
"reward_std": 0.09642485529184341,
"rewards/accuracy_reward_step": 0.85546875,
"rewards/final_brier_reward_step": 0.8303191661834717,
"rewards/format_reward_step": 1.0,
"rewards/step_margin_reward": 0.09227728843688965,
"step": 54
},
{
"adv/mean_abs_final_conf": 0.7900725603103638,
"adv/mean_abs_reasoning": 0.38732653856277466,
"adv/mean_abs_step_conf": 0.6862166523933411,
"adv/ratio_final_to_reasoning": 2.0398100353309907,
"adv/ratio_step_to_reasoning": 1.7716747603705052,
"adv/std_final_conf": 0.9358355402946472,
"adv/std_reasoning": 0.6612837910652161,
"adv/std_step_conf": 0.9048165678977966,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 12.78125,
"calib/ece": 0.17815999999999996,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.092,
"calib/gap": -0.0014392982922452058,
"calib/mean_conf": 0.7173600000000001,
"calib/mu_c": 0.716778523489933,
"calib/mu_w": 0.7182178217821782,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14975999999999998,
"calib/std_conf": 0.14797915528884464,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4989834660134722,
"calib/step_q_c_n": 1633.0,
"calib/step_q_gap": -0.048270957415447846,
"calib/step_q_w": 0.5472544234289201,
"calib/step_q_w_n": 1639.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2740.0,
"completions/max_terminated_length": 2740.0,
"completions/mean_length": 703.51171875,
"completions/mean_terminated_length": 717.5259399414062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 198.0,
"epoch": 0.058666666666666666,
"grad_norm": 27.244579315185547,
"kl": 0.24969482421875,
"learning_rate": 4.055555555555556e-06,
"loss": -0.0733,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.022532660514116287,
"mask/share_reasoning": 0.7766234278678894,
"mask/share_step_conf": 0.181312695145607,
"num_tokens": 17617756.0,
"reward": 0.5515916347503662,
"reward_std": 0.1400744915008545,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7049773931503296,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.08648727834224701,
"step": 55
},
{
"adv/mean_abs_final_conf": 0.7837743163108826,
"adv/mean_abs_reasoning": 0.4821297526359558,
"adv/mean_abs_step_conf": 0.6652582883834839,
"adv/ratio_final_to_reasoning": 1.6256501740988614,
"adv/ratio_step_to_reasoning": 1.3798324719565769,
"adv/std_final_conf": 0.9359019994735718,
"adv/std_reasoning": 0.7206350564956665,
"adv/std_step_conf": 0.8749900460243225,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 12.25,
"calib/ece": 0.19551732283464568,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.2283464566929134,
"calib/gap": -0.008955006894740158,
"calib/mean_conf": 0.7592070866141732,
"calib/mu_c": 0.7557872611464969,
"calib/mu_w": 0.764742268041237,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1683070866141732,
"calib/std_conf": 0.1681407624961063,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5358365802011646,
"calib/step_q_c_n": 1889.0,
"calib/step_q_gap": -0.03024425380043927,
"calib/step_q_w": 0.5660808340016039,
"calib/step_q_w_n": 1247.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2348.0,
"completions/max_terminated_length": 2348.0,
"completions/mean_length": 704.9140625,
"completions/mean_terminated_length": 710.4645385742188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 244.0,
"epoch": 0.05973333333333333,
"grad_norm": 16.545696258544922,
"kl": 0.248779296875,
"learning_rate": 4.027777777777779e-06,
"loss": -0.0361,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.023570137098431587,
"mask/share_reasoning": 0.7856115102767944,
"mask/share_step_conf": 0.18300580978393555,
"num_tokens": 17905054.0,
"reward": 0.5521728992462158,
"reward_std": 0.17187048494815826,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7059837579727173,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.07726814597845078,
"step": 56
},
{
"adv/mean_abs_final_conf": 0.7717324495315552,
"adv/mean_abs_reasoning": 0.3633062541484833,
"adv/mean_abs_step_conf": 0.6912558078765869,
"adv/ratio_final_to_reasoning": 2.124192580555324,
"adv/ratio_step_to_reasoning": 1.9026807273019601,
"adv/std_final_conf": 0.9351680874824524,
"adv/std_reasoning": 0.640233039855957,
"adv/std_step_conf": 0.8745987415313721,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 12.82421875,
"calib/ece": 0.15373545816733064,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.2908366533864542,
"calib/gap": -0.017319962894248753,
"calib/mean_conf": 0.8026788844621514,
"calib/mu_c": 0.7988836734693877,
"calib/mu_w": 0.8162036363636365,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0877689243027888,
"calib/std_conf": 0.15399733011199934,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5754106598984772,
"calib/step_q_c_n": 2364.0,
"calib/step_q_gap": -0.03773406262600598,
"calib/step_q_w": 0.6131447225244832,
"calib/step_q_w_n": 919.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2166.0,
"completions/max_terminated_length": 2166.0,
"completions/mean_length": 663.44921875,
"completions/mean_terminated_length": 676.6653442382812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 353.0,
"epoch": 0.0608,
"grad_norm": 61.37419509887695,
"kl": 0.268646240234375,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0286,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.022753167897462845,
"mask/share_reasoning": 0.7694884538650513,
"mask/share_step_conf": 0.188227117061615,
"num_tokens": 18181689.0,
"reward": 0.6276879906654358,
"reward_std": 0.1472686231136322,
"rewards/accuracy_reward_step": 0.765625,
"rewards/final_brier_reward_step": 0.7831728458404541,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.1229843869805336,
"step": 57
},
{
"adv/mean_abs_final_conf": 0.7331936359405518,
"adv/mean_abs_reasoning": 0.4673541188240051,
"adv/mean_abs_step_conf": 0.6613420248031616,
"adv/ratio_final_to_reasoning": 1.5688181753602042,
"adv/ratio_step_to_reasoning": 1.4150769152677733,
"adv/std_final_conf": 0.9347412586212158,
"adv/std_reasoning": 0.7393353581428528,
"adv/std_step_conf": 0.8752701282501221,
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 13.54296875,
"calib/ece": 0.20596787148594387,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.357429718875502,
"calib/gap": 0.03512728550295863,
"calib/mean_conf": 0.8160401606425703,
"calib/mu_c": 0.8273260355029587,
"calib/mu_w": 0.7921987500000001,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1716465863453816,
"calib/std_conf": 0.19137137908397986,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.639537118491921,
"calib/step_q_c_n": 2228.0,
"calib/step_q_gap": -0.017584834696133922,
"calib/step_q_w": 0.6571219531880549,
"calib/step_q_w_n": 1239.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2147.0,
"completions/max_terminated_length": 2147.0,
"completions/mean_length": 772.9765625,
"completions/mean_terminated_length": 794.706787109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 334.0,
"epoch": 0.06186666666666667,
"grad_norm": 9.372441291809082,
"kl": 0.243499755859375,
"learning_rate": 3.972222222222223e-06,
"loss": -0.0652,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.020479410886764526,
"mask/share_reasoning": 0.7819588780403137,
"mask/share_step_conf": 0.17021796107292175,
"num_tokens": 18485891.0,
"reward": 0.573380708694458,
"reward_std": 0.17932750284671783,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.7214945554733276,
"rewards/format_reward_step": 0.97265625,
"rewards/step_margin_reward": 0.09870444983243942,
"step": 58
},
{
"adv/mean_abs_final_conf": 0.7504489421844482,
"adv/mean_abs_reasoning": 0.4621698260307312,
"adv/mean_abs_step_conf": 0.6386820673942566,
"adv/ratio_final_to_reasoning": 1.6237514868280225,
"adv/ratio_step_to_reasoning": 1.3819207387022028,
"adv/std_final_conf": 0.9361416101455688,
"adv/std_reasoning": 0.7392333149909973,
"adv/std_step_conf": 0.8436469435691833,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 11.9453125,
"calib/ece": 0.2620114173228346,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.421259842519685,
"calib/gap": -0.013952311401020934,
"calib/mean_conf": 0.8032641732283463,
"calib/mu_c": 0.7987598837209302,
"calib/mu_w": 0.8127121951219511,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1940551181102362,
"calib/std_conf": 0.26044774079447564,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6302592832254853,
"calib/step_q_c_n": 2009.0,
"calib/step_q_gap": -0.0004373802635517787,
"calib/step_q_w": 0.6306966634890371,
"calib/step_q_w_n": 1049.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1729.0,
"completions/max_terminated_length": 1729.0,
"completions/mean_length": 716.1640625,
"completions/mean_terminated_length": 724.6561279296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 206.0,
"epoch": 0.06293333333333333,
"grad_norm": 8.109827041625977,
"kl": 0.267791748046875,
"learning_rate": 3.944444444444445e-06,
"loss": -0.0357,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.023571684956550598,
"mask/share_reasoning": 0.7911335229873657,
"mask/share_step_conf": 0.17357605695724487,
"num_tokens": 18775477.0,
"reward": 0.5561695098876953,
"reward_std": 0.22776079177856445,
"rewards/accuracy_reward_step": 0.671875,
"rewards/final_brier_reward_step": 0.6845602989196777,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.09574736654758453,
"step": 59
},
{
"adv/mean_abs_final_conf": 0.783907413482666,
"adv/mean_abs_reasoning": 0.44255656003952026,
"adv/mean_abs_step_conf": 0.6574190855026245,
"adv/ratio_final_to_reasoning": 1.7713157690232026,
"adv/ratio_step_to_reasoning": 1.48550297264584,
"adv/std_final_conf": 0.9329642653465271,
"adv/std_reasoning": 0.7206140160560608,
"adv/std_step_conf": 0.8595014214515686,
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 12.2890625,
"calib/ece": 0.20925992063492066,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.46825396825396826,
"calib/gap": 0.10986593680709522,
"calib/mean_conf": 0.8168511904761905,
"calib/mu_c": 0.8552170731707316,
"calib/mu_w": 0.7453511363636364,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1876587301587302,
"calib/std_conf": 0.23684537928585173,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6476460933403252,
"calib/step_q_c_n": 1907.0,
"calib/step_q_gap": 0.016674503348396263,
"calib/step_q_w": 0.6309715899919289,
"calib/step_q_w_n": 1239.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2673.0,
"completions/max_terminated_length": 2673.0,
"completions/mean_length": 708.765625,
"completions/mean_terminated_length": 720.0159301757812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 262.0,
"epoch": 0.064,
"grad_norm": 10.051220893859863,
"kl": 0.273529052734375,
"learning_rate": 3.916666666666667e-06,
"loss": -0.0035,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.022933753207325935,
"mask/share_reasoning": 0.7813419103622437,
"mask/share_step_conf": 0.18009933829307556,
"num_tokens": 19065777.0,
"reward": 0.5763288736343384,
"reward_std": 0.19373396039009094,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.7193988561630249,
"rewards/format_reward_step": 0.96875,
"rewards/step_margin_reward": 0.11138393729925156,
"step": 60
},
{
"adv/mean_abs_final_conf": 0.7766209244728088,
"adv/mean_abs_reasoning": 0.39034169912338257,
"adv/mean_abs_step_conf": 0.601711630821228,
"adv/ratio_final_to_reasoning": 1.9895925191106159,
"adv/ratio_step_to_reasoning": 1.5414997479709023,
"adv/std_final_conf": 0.9335206151008606,
"adv/std_reasoning": 0.6815312504768372,
"adv/std_step_conf": 0.8435513377189636,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 10.609375,
"calib/ece": 0.31292980392156877,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5137254901960784,
"calib/gap": -0.034817016777103915,
"calib/mean_conf": 0.7993839215686274,
"calib/mu_c": 0.7881878612716764,
"calib/mu_w": 0.8230048780487803,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21694117647058836,
"calib/std_conf": 0.28078543733590494,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6056639582124204,
"calib/step_q_c_n": 1723.0,
"calib/step_q_gap": -0.015997975322322944,
"calib/step_q_w": 0.6216619335347433,
"calib/step_q_w_n": 993.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2242.0,
"completions/max_terminated_length": 2242.0,
"completions/mean_length": 617.234375,
"completions/mean_terminated_length": 622.094482421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.06506666666666666,
"grad_norm": 44.98173904418945,
"kl": 0.3096923828125,
"learning_rate": 3.88888888888889e-06,
"loss": -0.0122,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.027154112234711647,
"mask/share_reasoning": 0.7856699228286743,
"mask/share_step_conf": 0.17936351895332336,
"num_tokens": 19327853.0,
"reward": 0.5646814107894897,
"reward_std": 0.22981145977973938,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.6698556542396545,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.1282571256160736,
"step": 61
},
{
"adv/mean_abs_final_conf": 0.7539002299308777,
"adv/mean_abs_reasoning": 0.48199620842933655,
"adv/mean_abs_step_conf": 0.6898258924484253,
"adv/ratio_final_to_reasoning": 1.564120664740465,
"adv/ratio_step_to_reasoning": 1.4311853088976276,
"adv/std_final_conf": 0.9354040622711182,
"adv/std_reasoning": 0.739365816116333,
"adv/std_step_conf": 0.8912215232849121,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 12.515625,
"calib/ece": 0.2386551587301588,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5277777777777778,
"calib/gap": 0.0037250000000000894,
"calib/mean_conf": 0.8140440476190476,
"calib/mu_c": 0.8151083333333334,
"calib/mu_w": 0.8113833333333333,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16920674603174612,
"calib/std_conf": 0.27045332652723747,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5826730943738656,
"calib/step_q_c_n": 2204.0,
"calib/step_q_gap": -0.03697860562613453,
"calib/step_q_w": 0.6196517000000001,
"calib/step_q_w_n": 1000.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2931.0,
"completions/max_terminated_length": 2931.0,
"completions/mean_length": 687.2734375,
"completions/mean_terminated_length": 698.1825561523438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 198.0,
"epoch": 0.06613333333333334,
"grad_norm": 37.392425537109375,
"kl": 0.27716064453125,
"learning_rate": 3.861111111111112e-06,
"loss": -0.0545,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.02452751062810421,
"mask/share_reasoning": 0.7799397706985474,
"mask/share_step_conf": 0.1799076795578003,
"num_tokens": 19610875.0,
"reward": 0.5806684494018555,
"reward_std": 0.24473421275615692,
"rewards/accuracy_reward_step": 0.703125,
"rewards/final_brier_reward_step": 0.7016507387161255,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.12296748161315918,
"step": 62
},
{
"adv/mean_abs_final_conf": 0.7358971834182739,
"adv/mean_abs_reasoning": 0.432090163230896,
"adv/mean_abs_step_conf": 0.6293022036552429,
"adv/ratio_final_to_reasoning": 1.7031102442038992,
"adv/ratio_step_to_reasoning": 1.4564140941087862,
"adv/std_final_conf": 0.9351003170013428,
"adv/std_reasoning": 0.720492422580719,
"adv/std_step_conf": 0.8755343556404114,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 11.20703125,
"calib/ece": 0.27219724409448814,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.4448818897637795,
"calib/gap": 0.007648740310077673,
"calib/mean_conf": 0.8210311023622047,
"calib/mu_c": 0.8236208333333334,
"calib/mu_w": 0.8159720930232557,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21590551181102358,
"calib/std_conf": 0.23959935279423172,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5948911862527716,
"calib/step_q_c_n": 1804.0,
"calib/step_q_gap": 0.0020115618396261414,
"calib/step_q_w": 0.5928796244131455,
"calib/step_q_w_n": 1065.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2110.0,
"completions/max_terminated_length": 2110.0,
"completions/mean_length": 703.6953125,
"completions/mean_terminated_length": 712.03955078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 209.0,
"epoch": 0.0672,
"grad_norm": 1.3926122188568115,
"kl": 0.2791748046875,
"learning_rate": 3.833333333333334e-06,
"loss": 0.0205,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.02410535141825676,
"mask/share_reasoning": 0.7952975034713745,
"mask/share_step_conf": 0.168878436088562,
"num_tokens": 19899661.0,
"reward": 0.5846147537231445,
"reward_std": 0.2183077484369278,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.6880985498428345,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.15300604701042175,
"step": 63
},
{
"adv/mean_abs_final_conf": 0.7396208047866821,
"adv/mean_abs_reasoning": 0.36050862073898315,
"adv/mean_abs_step_conf": 0.6242177486419678,
"adv/ratio_final_to_reasoning": 2.0516036572733864,
"adv/ratio_step_to_reasoning": 1.7314918776766681,
"adv/std_final_conf": 0.9338341951370239,
"adv/std_reasoning": 0.6403324007987976,
"adv/std_step_conf": 0.8599122762680054,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 11.1875,
"calib/ece": 0.2487636000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.596,
"calib/gap": -0.030148054807927593,
"calib/mean_conf": 0.8462763999999999,
"calib/mu_c": 0.8381967213114754,
"calib/mu_w": 0.868344776119403,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18152000000000013,
"calib/std_conf": 0.251969499668194,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5824359238291301,
"calib/step_q_c_n": 1943.0,
"calib/step_q_gap": -0.06967851699605987,
"calib/step_q_w": 0.65211444082519,
"calib/step_q_w_n": 921.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 1581.0,
"completions/max_terminated_length": 1581.0,
"completions/mean_length": 611.25,
"completions/mean_terminated_length": 625.9200439453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 226.0,
"epoch": 0.06826666666666667,
"grad_norm": 1.2419276237487793,
"kl": 0.314239501953125,
"learning_rate": 3.8055555555555556e-06,
"loss": -0.0402,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.025241907685995102,
"mask/share_reasoning": 0.7825311422348022,
"mask/share_step_conf": 0.16878946125507355,
"num_tokens": 20159917.0,
"reward": 0.5861185193061829,
"reward_std": 0.2304370403289795,
"rewards/accuracy_reward_step": 0.71484375,
"rewards/final_brier_reward_step": 0.6986793279647827,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.13527649641036987,
"step": 64
},
{
"adv/mean_abs_final_conf": 0.7525701522827148,
"adv/mean_abs_reasoning": 0.2651790380477905,
"adv/mean_abs_step_conf": 0.6145387291908264,
"adv/ratio_final_to_reasoning": 2.8379699912294227,
"adv/ratio_step_to_reasoning": 2.3174483689018976,
"adv/std_final_conf": 0.9266197085380554,
"adv/std_reasoning": 0.572488009929657,
"adv/std_step_conf": 0.8432523012161255,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 10.234375,
"calib/ece": 0.29576210937499997,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.64453125,
"calib/gap": 0.008698574697613926,
"calib/mean_conf": 0.897831640625,
"calib/mu_c": 0.9010596273291926,
"calib/mu_w": 0.8923610526315787,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.28234374999999995,
"calib/std_conf": 0.15485483769221853,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5674722730042657,
"calib/step_q_c_n": 1641.0,
"calib/step_q_gap": 0.014914969633479158,
"calib/step_q_w": 0.5525573033707866,
"calib/step_q_w_n": 979.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1508.0,
"completions/max_terminated_length": 1508.0,
"completions/mean_length": 585.296875,
"completions/mean_terminated_length": 589.905517578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 222.0,
"epoch": 0.06933333333333333,
"grad_norm": 1.0370111465454102,
"kl": 0.32586669921875,
"learning_rate": 3.777777777777778e-06,
"loss": 0.0075,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.027405641973018646,
"mask/share_reasoning": 0.785406231880188,
"mask/share_step_conf": 0.17937563359737396,
"num_tokens": 20414777.0,
"reward": 0.5884789824485779,
"reward_std": 0.1469491571187973,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.6730327606201172,
"rewards/format_reward_step": 0.99609375,
"rewards/step_margin_reward": 0.17892518639564514,
"step": 65
},
{
"adv/mean_abs_final_conf": 0.76250159740448,
"adv/mean_abs_reasoning": 0.43535977602005005,
"adv/mean_abs_step_conf": 0.6619211435317993,
"adv/ratio_final_to_reasoning": 1.7514286789998803,
"adv/ratio_step_to_reasoning": 1.5204003217360054,
"adv/std_final_conf": 0.9311895370483398,
"adv/std_reasoning": 0.7207090854644775,
"adv/std_step_conf": 0.875627338886261,
"calib/answer_extract_rate": 0.95703125,
"calib/avg_num_step_conf": 13.01953125,
"calib/ece": 0.40874493927125516,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.6437246963562753,
"calib/gap": -0.01084218811491533,
"calib/mean_conf": 0.9076923076923077,
"calib/mu_c": 0.9023809523809524,
"calib/mu_w": 0.9132231404958677,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4031578947368422,
"calib/std_conf": 0.10402009723648364,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4740751240255139,
"calib/step_q_c_n": 1411.0,
"calib/step_q_gap": -0.07345349199946005,
"calib/step_q_w": 0.5475286160249739,
"calib/step_q_w_n": 1922.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2400.0,
"completions/max_terminated_length": 2400.0,
"completions/mean_length": 722.234375,
"completions/mean_terminated_length": 748.5505981445312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 221.0,
"epoch": 0.0704,
"grad_norm": 1.0291537046432495,
"kl": 0.26129150390625,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.1164,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.02183469384908676,
"mask/share_reasoning": 0.7824565172195435,
"mask/share_step_conf": 0.16055256128311157,
"num_tokens": 20706021.0,
"reward": 0.4741209149360657,
"reward_std": 0.22096720337867737,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.554445743560791,
"rewards/format_reward_step": 0.95703125,
"rewards/step_margin_reward": 0.1039523333311081,
"step": 66
},
{
"adv/mean_abs_final_conf": 0.7682864665985107,
"adv/mean_abs_reasoning": 0.4244164228439331,
"adv/mean_abs_step_conf": 0.6096725463867188,
"adv/ratio_final_to_reasoning": 1.8102185147557919,
"adv/ratio_step_to_reasoning": 1.4364961240222984,
"adv/std_final_conf": 0.9361712336540222,
"adv/std_reasoning": 0.7204695343971252,
"adv/std_step_conf": 0.84321129322052,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 10.83984375,
"calib/ece": 0.28509249011857707,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.4268774703557312,
"calib/gap": -0.04268690137023479,
"calib/mean_conf": 0.8048679841897233,
"calib/mu_c": 0.7895141975308642,
"calib/mu_w": 0.832201098901099,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.22482213438735177,
"calib/std_conf": 0.21552196379980912,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.2893720588235294,
"calib/step_q_c_n": 1700.0,
"calib/step_q_gap": -0.07566049931600544,
"calib/step_q_w": 0.36503255813953484,
"calib/step_q_w_n": 1075.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2158.0,
"completions/max_terminated_length": 2158.0,
"completions/mean_length": 694.09375,
"completions/mean_terminated_length": 702.3241577148438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 284.0,
"epoch": 0.07146666666666666,
"grad_norm": 1.5884639024734497,
"kl": 0.27716064453125,
"learning_rate": 3.7222222222222225e-06,
"loss": -0.0467,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.023152736946940422,
"mask/share_reasoning": 0.803122878074646,
"mask/share_step_conf": 0.16200563311576843,
"num_tokens": 20988717.0,
"reward": 0.5525285005569458,
"reward_std": 0.2043268233537674,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.6637780070304871,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.11940403282642365,
"step": 67
},
{
"adv/mean_abs_final_conf": 0.7617519497871399,
"adv/mean_abs_reasoning": 0.3916369080543518,
"adv/mean_abs_step_conf": 0.5791916251182556,
"adv/ratio_final_to_reasoning": 1.9450463787274694,
"adv/ratio_step_to_reasoning": 1.4788994939105045,
"adv/std_final_conf": 0.9319273233413696,
"adv/std_reasoning": 0.6612851023674011,
"adv/std_step_conf": 0.8105087876319885,
"calib/answer_extract_rate": 0.95703125,
"calib/avg_num_step_conf": 11.60546875,
"calib/ece": 0.2744149797570849,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.5748987854251012,
"calib/gap": -0.0012663043478261526,
"calib/mean_conf": 0.8724716599190284,
"calib/mu_c": 0.872,
"calib/mu_w": 0.8732663043478261,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.25967813765182174,
"calib/std_conf": 0.15390637173820043,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.29178073510773134,
"calib/step_q_c_n": 1578.0,
"calib/step_q_gap": -0.16684094471997857,
"calib/step_q_w": 0.4586216798277099,
"calib/step_q_w_n": 1393.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2912.0,
"completions/max_terminated_length": 2912.0,
"completions/mean_length": 709.953125,
"completions/mean_terminated_length": 726.9920043945312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 205.0,
"epoch": 0.07253333333333334,
"grad_norm": 1.2024588584899902,
"kl": 0.277191162109375,
"learning_rate": 3.694444444444445e-06,
"loss": -0.0324,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.02384016662836075,
"mask/share_reasoning": 0.7884659171104431,
"mask/share_step_conf": 0.16425639390945435,
"num_tokens": 21274553.0,
"reward": 0.5623621940612793,
"reward_std": 0.1624753475189209,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.6578570008277893,
"rewards/format_reward_step": 0.95703125,
"rewards/step_margin_reward": 0.1543673574924469,
"step": 68
},
{
"adv/mean_abs_final_conf": 0.7745255827903748,
"adv/mean_abs_reasoning": 0.5398683547973633,
"adv/mean_abs_step_conf": 0.6580164432525635,
"adv/ratio_final_to_reasoning": 1.4346563859648502,
"adv/ratio_step_to_reasoning": 1.218846108324957,
"adv/std_final_conf": 0.9221952557563782,
"adv/std_reasoning": 0.7754866480827332,
"adv/std_step_conf": 0.8600015044212341,
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 12.41015625,
"calib/ece": 0.36751004016064265,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.8755020080321285,
"calib/gap": 0.01665674603174594,
"calib/mean_conf": 0.9458232931726909,
"calib/mu_c": 0.9528472222222221,
"calib/mu_w": 0.9361904761904761,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.36751004016064265,
"calib/std_conf": 0.0792600373053975,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.39045735475896165,
"calib/step_q_c_n": 1618.0,
"calib/step_q_gap": -0.11180691721024932,
"calib/step_q_w": 0.502264271969211,
"calib/step_q_w_n": 1559.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2759.0,
"completions/max_terminated_length": 2759.0,
"completions/mean_length": 766.44921875,
"completions/mean_terminated_length": 784.8440551757812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 334.0,
"epoch": 0.0736,
"grad_norm": 1.6587779521942139,
"kl": 0.2686767578125,
"learning_rate": 3.6666666666666666e-06,
"loss": -0.0302,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.02062370441854,
"mask/share_reasoning": 0.8001226186752319,
"mask/share_step_conf": 0.15581616759300232,
"num_tokens": 21575260.0,
"reward": 0.5319957733154297,
"reward_std": 0.24367490410804749,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.6033320426940918,
"rewards/format_reward_step": 0.96875,
"rewards/step_margin_reward": 0.15440955758094788,
"step": 69
},
{
"adv/mean_abs_final_conf": 0.6887208223342896,
"adv/mean_abs_reasoning": 0.3762162923812866,
"adv/mean_abs_step_conf": 0.5653591156005859,
"adv/ratio_final_to_reasoning": 1.8306512404738886,
"adv/ratio_step_to_reasoning": 1.5027502185567427,
"adv/std_final_conf": 0.8945690989494324,
"adv/std_reasoning": 0.6815224289894104,
"adv/std_step_conf": 0.8105819225311279,
"calib/answer_extract_rate": 0.96484375,
"calib/avg_num_step_conf": 11.78515625,
"calib/ece": 0.3446093117408908,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.9311740890688259,
"calib/gap": 0.0021444281524929387,
"calib/mean_conf": 0.9600789473684209,
"calib/mu_c": 0.9608863636363637,
"calib/mu_w": 0.9587419354838708,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.34060323886639693,
"calib/std_conf": 0.07443519341229098,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5036967632027257,
"calib/step_q_c_n": 1761.0,
"calib/step_q_gap": -0.028866134886446293,
"calib/step_q_w": 0.532562898089172,
"calib/step_q_w_n": 1256.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2785.0,
"completions/max_terminated_length": 2785.0,
"completions/mean_length": 755.53515625,
"completions/mean_terminated_length": 773.6680297851562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 69.0,
"epoch": 0.07466666666666667,
"grad_norm": 1.2605822086334229,
"kl": 0.252716064453125,
"learning_rate": 3.638888888888889e-06,
"loss": -0.0585,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.021993208676576614,
"mask/share_reasoning": 0.7888750433921814,
"mask/share_step_conf": 0.1656942367553711,
"num_tokens": 21875669.0,
"reward": 0.5579054355621338,
"reward_std": 0.17114701867103577,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6246556043624878,
"rewards/format_reward_step": 0.9609375,
"rewards/step_margin_reward": 0.17865526676177979,
"step": 70
},
{
"adv/mean_abs_final_conf": 0.7037140130996704,
"adv/mean_abs_reasoning": 0.6130764484405518,
"adv/mean_abs_step_conf": 0.6614584922790527,
"adv/ratio_final_to_reasoning": 1.1478405586932403,
"adv/ratio_step_to_reasoning": 1.07891682018053,
"adv/std_final_conf": 0.918972373008728,
"adv/std_reasoning": 0.8588365316390991,
"adv/std_step_conf": 0.8758206963539124,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 12.41796875,
"calib/ece": 0.32935714285714296,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9206349206349206,
"calib/gap": 0.004954610606784482,
"calib/mean_conf": 0.963484126984127,
"calib/mu_c": 0.9652732919254657,
"calib/mu_w": 0.9603186813186813,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3269761904761906,
"calib/std_conf": 0.05988026864873096,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5245230607966457,
"calib/step_q_c_n": 1908.0,
"calib/step_q_gap": -0.06256584557628908,
"calib/step_q_w": 0.5870889063729348,
"calib/step_q_w_n": 1271.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2907.0,
"completions/max_terminated_length": 2907.0,
"completions/mean_length": 736.09375,
"completions/mean_terminated_length": 747.77783203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 248.0,
"epoch": 0.07573333333333333,
"grad_norm": 1.374493956565857,
"kl": 0.26165771484375,
"learning_rate": 3.6111111111111115e-06,
"loss": -0.0009,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.022440209984779358,
"mask/share_reasoning": 0.7956840991973877,
"mask/share_step_conf": 0.16625072062015533,
"num_tokens": 22168517.0,
"reward": 0.5891203880310059,
"reward_std": 0.2593870759010315,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.6522749662399292,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.203309565782547,
"step": 71
},
{
"adv/mean_abs_final_conf": 0.7037005424499512,
"adv/mean_abs_reasoning": 0.29564470052719116,
"adv/mean_abs_step_conf": 0.602660059928894,
"adv/ratio_final_to_reasoning": 2.380223765875452,
"adv/ratio_step_to_reasoning": 2.0384605536789113,
"adv/std_final_conf": 0.8824068903923035,
"adv/std_reasoning": 0.5959694981575012,
"adv/std_step_conf": 0.8436238169670105,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 11.37109375,
"calib/ece": 0.40128858267716533,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.952755905511811,
"calib/gap": 0.02412757575757596,
"calib/mean_conf": 0.9682177165354331,
"calib/mu_c": 0.9786666666666667,
"calib/mu_w": 0.9545390909090907,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.40128858267716533,
"calib/std_conf": 0.08601273139317762,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5057326007326007,
"calib/step_q_c_n": 1638.0,
"calib/step_q_gap": -0.0382893945541235,
"calib/step_q_w": 0.5440219952867242,
"calib/step_q_w_n": 1273.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2539.0,
"completions/max_terminated_length": 2539.0,
"completions/mean_length": 663.0,
"completions/mean_terminated_length": 668.220458984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 262.0,
"epoch": 0.0768,
"grad_norm": 0.9264628887176514,
"kl": 0.29205322265625,
"learning_rate": 3.5833333333333335e-06,
"loss": -0.0476,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.02300153858959675,
"mask/share_reasoning": 0.7928996086120605,
"mask/share_step_conf": 0.17628639936447144,
"num_tokens": 22442653.0,
"reward": 0.5886973142623901,
"reward_std": 0.18019168078899384,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.5893194675445557,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.2771375775337219,
"step": 72
},
{
"adv/mean_abs_final_conf": 0.6671165227890015,
"adv/mean_abs_reasoning": 0.49489885568618774,
"adv/mean_abs_step_conf": 0.6618808507919312,
"adv/ratio_final_to_reasoning": 1.3479855835674348,
"adv/ratio_step_to_reasoning": 1.3374063067376047,
"adv/std_final_conf": 0.8313454985618591,
"adv/std_reasoning": 0.7393487691879272,
"adv/std_step_conf": 0.859908938407898,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 11.71484375,
"calib/ece": 0.2669388196078431,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.9647058823529412,
"calib/gap": -0.006764875642823909,
"calib/mean_conf": 0.9688337215686275,
"calib/mu_c": 0.9670032258064517,
"calib/mu_w": 0.9737681014492756,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2531803882352941,
"calib/std_conf": 0.11327860894981376,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5308225419664268,
"calib/step_q_c_n": 2085.0,
"calib/step_q_gap": 0.0013051579401686109,
"calib/step_q_w": 0.5295173840262581,
"calib/step_q_w_n": 914.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2538.0,
"completions/max_terminated_length": 2538.0,
"completions/mean_length": 690.875,
"completions/mean_terminated_length": 696.31494140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 337.0,
"epoch": 0.07786666666666667,
"grad_norm": 1.6943904161453247,
"kl": 0.269775390625,
"learning_rate": 3.555555555555556e-06,
"loss": -0.0083,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.022829489782452583,
"mask/share_reasoning": 0.7974672317504883,
"mask/share_step_conf": 0.1718907356262207,
"num_tokens": 22726549.0,
"reward": 0.6816904544830322,
"reward_std": 0.24758538603782654,
"rewards/accuracy_reward_step": 0.7265625,
"rewards/final_brier_reward_step": 0.726353645324707,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.2932772636413574,
"step": 73
},
{
"adv/mean_abs_final_conf": 0.684027910232544,
"adv/mean_abs_reasoning": 0.42929649353027344,
"adv/mean_abs_step_conf": 0.6507533192634583,
"adv/ratio_final_to_reasoning": 1.5933694324114185,
"adv/ratio_step_to_reasoning": 1.515859852271465,
"adv/std_final_conf": 0.8723176717758179,
"adv/std_reasoning": 0.7014433145523071,
"adv/std_step_conf": 0.8599750399589539,
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 11.33984375,
"calib/ece": 0.41303135999999985,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.98,
"calib/gap": 0.002038238023658656,
"calib/mean_conf": 0.9772017599999999,
"calib/mu_c": 0.9780741258741259,
"calib/mu_w": 0.9760358878504672,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.40911655999999985,
"calib/std_conf": 0.08859416269316168,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5037612244897959,
"calib/step_q_c_n": 1470.0,
"calib/step_q_gap": -0.07674526539157189,
"calib/step_q_w": 0.5805064898813678,
"calib/step_q_w_n": 1433.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2710.0,
"completions/max_terminated_length": 2710.0,
"completions/mean_length": 698.56640625,
"completions/mean_terminated_length": 715.33203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 240.0,
"epoch": 0.07893333333333333,
"grad_norm": 1.0564448833465576,
"kl": 0.2877197265625,
"learning_rate": 3.5277777777777784e-06,
"loss": -0.0091,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.023499177768826485,
"mask/share_reasoning": 0.7832315564155579,
"mask/share_step_conf": 0.1698317527770996,
"num_tokens": 23009310.0,
"reward": 0.5486522912979126,
"reward_std": 0.22563649713993073,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.56647127866745,
"rewards/format_reward_step": 0.96875,
"rewards/step_margin_reward": 0.2253645807504654,
"step": 74
},
{
"adv/mean_abs_final_conf": 0.5781558156013489,
"adv/mean_abs_reasoning": 0.2851184606552124,
"adv/mean_abs_step_conf": 0.6577544212341309,
"adv/ratio_final_to_reasoning": 2.0277740496800036,
"adv/ratio_step_to_reasoning": 2.3069513623305475,
"adv/std_final_conf": 0.8220309019088745,
"adv/std_reasoning": 0.5959504246711731,
"adv/std_step_conf": 0.8598629832267761,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 12.52734375,
"calib/ece": 0.17877470355731245,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9604743083003953,
"calib/gap": 0.023058128078817575,
"calib/mean_conf": 0.974901185770751,
"calib/mu_c": 0.9794581280788176,
"calib/mu_w": 0.9564,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17565217391304366,
"calib/std_conf": 0.05979532050524667,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5821794439764112,
"calib/step_q_c_n": 2374.0,
"calib/step_q_gap": -0.08426713465504154,
"calib/step_q_w": 0.6664465786314527,
"calib/step_q_w_n": 833.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1907.0,
"completions/max_terminated_length": 1907.0,
"completions/mean_length": 691.41796875,
"completions/mean_terminated_length": 696.8621826171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 256.0,
"epoch": 0.08,
"grad_norm": 2.8885269165039062,
"kl": 0.262725830078125,
"learning_rate": 3.5e-06,
"loss": -0.0022,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.022322513163089752,
"mask/share_reasoning": 0.7913630604743958,
"mask/share_step_conf": 0.1785019338130951,
"num_tokens": 23291065.0,
"reward": 0.7158107757568359,
"reward_std": 0.1600218415260315,
"rewards/accuracy_reward_step": 0.796875,
"rewards/final_brier_reward_step": 0.8058438301086426,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.26874634623527527,
"step": 75
},
{
"adv/mean_abs_final_conf": 0.596921443939209,
"adv/mean_abs_reasoning": 0.3288942277431488,
"adv/mean_abs_step_conf": 0.6491069197654724,
"adv/ratio_final_to_reasoning": 1.8149343879801292,
"adv/ratio_step_to_reasoning": 1.9736038671751786,
"adv/std_final_conf": 0.8286421298980713,
"adv/std_reasoning": 0.6186842322349548,
"adv/std_step_conf": 0.8600033521652222,
"calib/answer_extract_rate": 0.953125,
"calib/avg_num_step_conf": 15.1484375,
"calib/ece": 0.30355514403292194,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.9670781893004116,
"calib/gap": -0.0019850101705521928,
"calib/mean_conf": 0.9783699588477366,
"calib/mu_c": 0.9777409638554215,
"calib/mu_w": 0.9797259740259737,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.29939876543209887,
"calib/std_conf": 0.05383392961866714,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5899240265906933,
"calib/step_q_c_n": 2106.0,
"calib/step_q_gap": -0.118399336840458,
"calib/step_q_w": 0.7083233634311513,
"calib/step_q_w_n": 1772.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2998.0,
"completions/max_terminated_length": 2998.0,
"completions/mean_length": 768.8359375,
"completions/mean_terminated_length": 806.6475219726562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 294.0,
"epoch": 0.08106666666666666,
"grad_norm": 1.231719732284546,
"kl": 0.2412109375,
"learning_rate": 3.4722222222222224e-06,
"loss": -0.1814,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.02048310451209545,
"mask/share_reasoning": 0.7641509771347046,
"mask/share_step_conf": 0.1684909164905548,
"num_tokens": 23590943.0,
"reward": 0.5938359498977661,
"reward_std": 0.18622279167175293,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.6574385762214661,
"rewards/format_reward_step": 0.94921875,
"rewards/step_margin_reward": 0.21070200204849243,
"step": 76
},
{
"adv/mean_abs_final_conf": 0.47663238644599915,
"adv/mean_abs_reasoning": 0.3979429006576538,
"adv/mean_abs_step_conf": 0.7344629764556885,
"adv/ratio_final_to_reasoning": 1.1977406448470382,
"adv/ratio_step_to_reasoning": 1.8456491502722885,
"adv/std_final_conf": 0.7308207750320435,
"adv/std_reasoning": 0.6816669702529907,
"adv/std_step_conf": 0.9212200045585632,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 13.75390625,
"calib/ece": 0.26344840873015873,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9920634920634921,
"calib/gap": -0.001376341886433874,
"calib/mean_conf": 0.9881309484126984,
"calib/mu_c": 0.9877540928961747,
"calib/mu_w": 0.9891304347826085,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2626944404761905,
"calib/std_conf": 0.013123445461125982,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5077341389728096,
"calib/step_q_c_n": 2317.0,
"calib/step_q_gap": -0.10131901717336977,
"calib/step_q_w": 0.6090531561461794,
"calib/step_q_w_n": 1204.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2509.0,
"completions/max_terminated_length": 2509.0,
"completions/mean_length": 751.79296875,
"completions/mean_terminated_length": 760.70751953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 252.0,
"epoch": 0.08213333333333334,
"grad_norm": 0.4560396671295166,
"kl": 0.264556884765625,
"learning_rate": 3.444444444444445e-06,
"loss": -0.0218,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.022371895611286163,
"mask/share_reasoning": 0.7850345373153687,
"mask/share_step_conf": 0.1808748096227646,
"num_tokens": 23888066.0,
"reward": 0.6734690070152283,
"reward_std": 0.225606769323349,
"rewards/accuracy_reward_step": 0.71484375,
"rewards/final_brier_reward_step": 0.7203949093818665,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.2866993546485901,
"step": 77
},
{
"adv/mean_abs_final_conf": 0.39663469791412354,
"adv/mean_abs_reasoning": 0.3622138798236847,
"adv/mean_abs_step_conf": 0.5514669418334961,
"adv/ratio_final_to_reasoning": 1.0950289870371448,
"adv/ratio_step_to_reasoning": 1.5224898121019945,
"adv/std_final_conf": 0.6700455546379089,
"adv/std_reasoning": 0.6613717079162598,
"adv/std_step_conf": 0.8106377720832825,
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 16.52734375,
"calib/ece": 0.33469477911646595,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0003719503495508425,
"calib/mean_conf": 0.9893132530120483,
"calib/mu_c": 0.989441717791411,
"calib/mu_w": 0.9890697674418601,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.33469477911646595,
"calib/std_conf": 0.0037830557892607965,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4919075630252101,
"calib/step_q_c_n": 2380.0,
"calib/step_q_gap": -0.10491037322546526,
"calib/step_q_w": 0.5968179362506754,
"calib/step_q_w_n": 1851.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3072.0,
"completions/mean_length": 884.49609375,
"completions/mean_terminated_length": 909.3613891601562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 360.0,
"epoch": 0.0832,
"grad_norm": 0.42842844128608704,
"kl": 0.218841552734375,
"learning_rate": 3.416666666666667e-06,
"loss": -0.1436,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.01720147207379341,
"mask/share_reasoning": 0.7845550775527954,
"mask/share_step_conf": 0.17089968919754028,
"num_tokens": 24222521.0,
"reward": 0.5937488675117493,
"reward_std": 0.1932610720396042,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.6439374685287476,
"rewards/format_reward_step": 0.97265625,
"rewards/step_margin_reward": 0.2216852605342865,
"step": 78
},
{
"adv/mean_abs_final_conf": 0.44898343086242676,
"adv/mean_abs_reasoning": 0.3736734390258789,
"adv/mean_abs_step_conf": 0.6339391469955444,
"adv/ratio_final_to_reasoning": 1.2015395903783577,
"adv/ratio_step_to_reasoning": 1.6965057742614689,
"adv/std_final_conf": 0.7077057957649231,
"adv/std_reasoning": 0.6614980697631836,
"adv/std_step_conf": 0.8597406148910522,
"calib/answer_extract_rate": 0.91796875,
"calib/avg_num_step_conf": 17.51171875,
"calib/ece": 0.287361659574468,
"calib/final_conf_rate": 0.91796875,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.000523666666666589,
"calib/mean_conf": 0.9894893191489361,
"calib/mu_c": 0.9893333333333331,
"calib/mu_w": 0.9898569999999997,
"calib/nonempty_final_conf_rate": 0.91796875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.287361659574468,
"calib/std_conf": 0.003286667646308783,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4385850694444444,
"calib/step_q_c_n": 2304.0,
"calib/step_q_gap": -0.2518131815371476,
"calib/step_q_w": 0.690398250981592,
"calib/step_q_w_n": 2179.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08203125,
"completions/max_length": 1884.0,
"completions/max_terminated_length": 1884.0,
"completions/mean_length": 773.8515625,
"completions/mean_terminated_length": 843.0042114257812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 311.0,
"epoch": 0.08426666666666667,
"grad_norm": 0.5278450846672058,
"kl": 0.23162841796875,
"learning_rate": 3.3888888888888893e-06,
"loss": -0.232,
"mask/has_final_conf_rate": 0.91796875,
"mask/share_final_conf": 0.01737746223807335,
"mask/share_reasoning": 0.7424191236495972,
"mask/share_step_conf": 0.1581721305847168,
"num_tokens": 24527003.0,
"reward": 0.5930335521697998,
"reward_std": 0.19911476969718933,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.6499668955802917,
"rewards/format_reward_step": 0.91796875,
"rewards/step_margin_reward": 0.22360025346279144,
"step": 79
},
{
"adv/mean_abs_final_conf": 0.4606902301311493,
"adv/mean_abs_reasoning": 0.4467974603176117,
"adv/mean_abs_step_conf": 0.6338987350463867,
"adv/ratio_final_to_reasoning": 1.0310941109729266,
"adv/ratio_step_to_reasoning": 1.4187608286666886,
"adv/std_final_conf": 0.7015774250030518,
"adv/std_reasoning": 0.7015236616134644,
"adv/std_step_conf": 0.8599874377250671,
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 15.4453125,
"calib/ece": 0.25608870967741937,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0001515151515150137,
"calib/mean_conf": 0.9899596774193549,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.9898484848484849,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.25608870967741937,
"calib/std_conf": 0.0010991139653579855,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.39251846094053633,
"calib/step_q_c_n": 2573.0,
"calib/step_q_gap": -0.15269515238314224,
"calib/step_q_w": 0.5452136133236786,
"calib/step_q_w_n": 1381.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1968.0,
"completions/max_terminated_length": 1968.0,
"completions/mean_length": 751.12890625,
"completions/mean_terminated_length": 775.3588256835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 229.0,
"epoch": 0.08533333333333333,
"grad_norm": 0.39397016167640686,
"kl": 0.25689697265625,
"learning_rate": 3.3611111111111117e-06,
"loss": -0.0626,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.019908176735043526,
"mask/share_reasoning": 0.7659863233566284,
"mask/share_step_conf": 0.1828555464744568,
"num_tokens": 24821452.0,
"reward": 0.6637634634971619,
"reward_std": 0.2425382137298584,
"rewards/accuracy_reward_step": 0.7109375,
"rewards/final_brier_reward_step": 0.7160730361938477,
"rewards/format_reward_step": 0.96875,
"rewards/step_margin_reward": 0.2755163013935089,
"step": 80
},
{
"adv/mean_abs_final_conf": 0.4842243194580078,
"adv/mean_abs_reasoning": 0.43560612201690674,
"adv/mean_abs_step_conf": 0.6790660619735718,
"adv/ratio_final_to_reasoning": 1.111610454912785,
"adv/ratio_step_to_reasoning": 1.5588992616297892,
"adv/std_final_conf": 0.728503406047821,
"adv/std_reasoning": 0.7210630178451538,
"adv/std_step_conf": 0.8755649924278259,
"calib/answer_extract_rate": 0.90625,
"calib/avg_num_step_conf": 18.1484375,
"calib/ece": 0.24861637931034497,
"calib/final_conf_rate": 0.90625,
"calib/format_rate": 0.90625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -5.813953488642554e-06,
"calib/mean_conf": 0.9899956896551726,
"calib/mu_c": 0.9899941860465116,
"calib/mu_w": 0.9900000000000002,
"calib/nonempty_final_conf_rate": 0.90625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.24861637931034497,
"calib/std_conf": 0.0015824954236126742,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3576974088713219,
"calib/step_q_c_n": 2277.0,
"calib/step_q_gap": -0.26280913397376043,
"calib/step_q_w": 0.6205065428450823,
"calib/step_q_w_n": 2369.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08984375,
"completions/max_length": 2933.0,
"completions/max_terminated_length": 2933.0,
"completions/mean_length": 749.546875,
"completions/mean_terminated_length": 823.5364990234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 356.0,
"epoch": 0.0864,
"grad_norm": 0.5047710537910461,
"kl": 0.24090576171875,
"learning_rate": 3.3333333333333333e-06,
"loss": -0.308,
"mask/has_final_conf_rate": 0.90625,
"mask/share_final_conf": 0.017829496413469315,
"mask/share_reasoning": 0.7260985374450684,
"mask/share_step_conf": 0.16622823476791382,
"num_tokens": 25119584.0,
"reward": 0.6293458938598633,
"reward_std": 0.24940373003482819,
"rewards/accuracy_reward_step": 0.671875,
"rewards/final_brier_reward_step": 0.6764695048332214,
"rewards/format_reward_step": 0.90625,
"rewards/step_margin_reward": 0.2665971517562866,
"step": 81
},
{
"adv/mean_abs_final_conf": 0.4893885850906372,
"adv/mean_abs_reasoning": 0.4772290289402008,
"adv/mean_abs_step_conf": 0.669119119644165,
"adv/ratio_final_to_reasoning": 1.025479498129943,
"adv/ratio_step_to_reasoning": 1.4020922430684932,
"adv/std_final_conf": 0.7392094731330872,
"adv/std_reasoning": 0.7395800352096558,
"adv/std_step_conf": 0.8755518198013306,
"calib/answer_extract_rate": 0.9140625,
"calib/avg_num_step_conf": 17.9296875,
"calib/ece": 0.2990042918454936,
"calib/final_conf_rate": 0.91015625,
"calib/format_rate": 0.91015625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0003895790200137972,
"calib/mean_conf": 0.9899914163090129,
"calib/mu_c": 0.9901118012422361,
"calib/mu_w": 0.9897222222222223,
"calib/nonempty_final_conf_rate": 0.91015625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2990042918454936,
"calib/std_conf": 0.001246424642294338,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.35714638694638695,
"calib/step_q_c_n": 2145.0,
"calib/step_q_gap": -0.2990908318675191,
"calib/step_q_w": 0.6562372188139061,
"calib/step_q_w_n": 2445.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0859375,
"completions/max_length": 2197.0,
"completions/max_terminated_length": 2197.0,
"completions/mean_length": 714.0234375,
"completions/mean_terminated_length": 781.1538696289062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 382.0,
"epoch": 0.08746666666666666,
"grad_norm": 0.478674054145813,
"kl": 0.247528076171875,
"learning_rate": 3.3055555555555558e-06,
"loss": -0.2793,
"mask/has_final_conf_rate": 0.91015625,
"mask/share_final_conf": 0.018443914130330086,
"mask/share_reasoning": 0.7258409857749939,
"mask/share_step_conf": 0.16977760195732117,
"num_tokens": 25407926.0,
"reward": 0.5754408836364746,
"reward_std": 0.24024444818496704,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.6345949172973633,
"rewards/format_reward_step": 0.91015625,
"rewards/step_margin_reward": 0.2084743082523346,
"step": 82
},
{
"adv/mean_abs_final_conf": 0.43164581060409546,
"adv/mean_abs_reasoning": 0.43753427267074585,
"adv/mean_abs_step_conf": 0.6022412776947021,
"adv/ratio_final_to_reasoning": 0.9865417124224196,
"adv/ratio_step_to_reasoning": 1.3764436646724174,
"adv/std_final_conf": 0.7003706693649292,
"adv/std_reasoning": 0.7015554308891296,
"adv/std_step_conf": 0.8437060713768005,
"calib/answer_extract_rate": 0.890625,
"calib/avg_num_step_conf": 19.76171875,
"calib/ece": 0.3889868421052633,
"calib/final_conf_rate": 0.890625,
"calib/format_rate": 0.890625,
"calib/frac_conf_gt_0.9": 0.9956140350877193,
"calib/gap": 0.011206521739130726,
"calib/mean_conf": 0.9854780701754386,
"calib/mu_c": 0.9900000000000001,
"calib/mu_w": 0.9787934782608694,
"calib/nonempty_final_conf_rate": 0.890625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3889868421052633,
"calib/std_conf": 0.06553487388041113,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.40866153846153846,
"calib/step_q_c_n": 1950.0,
"calib/step_q_gap": -0.22232913378033997,
"calib/step_q_w": 0.6309906722418784,
"calib/step_q_w_n": 3109.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10546875,
"completions/max_length": 2887.0,
"completions/max_terminated_length": 2887.0,
"completions/mean_length": 850.54296875,
"completions/mean_terminated_length": 950.8253173828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 358.0,
"epoch": 0.08853333333333334,
"grad_norm": 0.3890380263328552,
"kl": 0.208526611328125,
"learning_rate": 3.277777777777778e-06,
"loss": -0.1941,
"mask/has_final_conf_rate": 0.890625,
"mask/share_final_conf": 0.015617813915014267,
"mask/share_reasoning": 0.717111349105835,
"mask/share_step_conf": 0.16180211305618286,
"num_tokens": 25732929.0,
"reward": 0.4802950620651245,
"reward_std": 0.20812444388866425,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.5424792766571045,
"rewards/format_reward_step": 0.890625,
"rewards/step_margin_reward": 0.13373583555221558,
"step": 83
},
{
"adv/mean_abs_final_conf": 0.44959962368011475,
"adv/mean_abs_reasoning": 0.43477439880371094,
"adv/mean_abs_step_conf": 0.6621416807174683,
"adv/ratio_final_to_reasoning": 1.0340986610922716,
"adv/ratio_step_to_reasoning": 1.5229546232238196,
"adv/std_final_conf": 0.7216721177101135,
"adv/std_reasoning": 0.7207791805267334,
"adv/std_step_conf": 0.8756316900253296,
"calib/answer_extract_rate": 0.94140625,
"calib/avg_num_step_conf": 15.00390625,
"calib/ece": 0.3046721991701246,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.995850622406639,
"calib/gap": 0.002676794258373416,
"calib/mean_conf": 0.989319502074689,
"calib/mu_c": 0.9901636363636362,
"calib/mu_w": 0.9874868421052628,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3046721991701246,
"calib/std_conf": 0.012359148062504485,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42497302413629906,
"calib/step_q_c_n": 2113.0,
"calib/step_q_gap": -0.12437246197481205,
"calib/step_q_w": 0.5493454861111111,
"calib/step_q_w_n": 1728.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2879.0,
"completions/max_terminated_length": 2879.0,
"completions/mean_length": 759.96484375,
"completions/mean_terminated_length": 797.3401489257812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 276.0,
"epoch": 0.0896,
"grad_norm": 0.716595470905304,
"kl": 0.25469970703125,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.1958,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.02004670724272728,
"mask/share_reasoning": 0.757233202457428,
"mask/share_step_conf": 0.17584508657455444,
"num_tokens": 26033400.0,
"reward": 0.6169133186340332,
"reward_std": 0.24483832716941833,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.6517096757888794,
"rewards/format_reward_step": 0.94140625,
"rewards/step_margin_reward": 0.2649293541908264,
"step": 84
},
{
"adv/mean_abs_final_conf": 0.5315093994140625,
"adv/mean_abs_reasoning": 0.5010403394699097,
"adv/mean_abs_step_conf": 0.5558913946151733,
"adv/ratio_final_to_reasoning": 1.0608115904926707,
"adv/ratio_step_to_reasoning": 1.1094743293589793,
"adv/std_final_conf": 0.7596035003662109,
"adv/std_reasoning": 0.757909893989563,
"adv/std_step_conf": 0.7936156988143921,
"calib/answer_extract_rate": 0.890625,
"calib/avg_num_step_conf": 17.48828125,
"calib/ece": 0.2811409691629957,
"calib/final_conf_rate": 0.88671875,
"calib/format_rate": 0.88671875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0005741577263316255,
"calib/mean_conf": 0.9903920704845816,
"calib/mu_c": 0.9905590062111801,
"calib/mu_w": 0.9899848484848485,
"calib/nonempty_final_conf_rate": 0.88671875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2811409691629957,
"calib/std_conf": 0.0020522961534924387,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.45196431838975293,
"calib/step_q_c_n": 2186.0,
"calib/step_q_gap": -0.1981753586071917,
"calib/step_q_w": 0.6501396769969446,
"calib/step_q_w_n": 2291.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09765625,
"completions/max_length": 3066.0,
"completions/max_terminated_length": 3066.0,
"completions/mean_length": 742.7890625,
"completions/mean_terminated_length": 823.177490234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 77.0,
"epoch": 0.09066666666666667,
"grad_norm": 0.3252770006656647,
"kl": 0.23199462890625,
"learning_rate": 3.2222222222222227e-06,
"loss": -0.2967,
"mask/has_final_conf_rate": 0.88671875,
"mask/share_final_conf": 0.01761934906244278,
"mask/share_reasoning": 0.7188683748245239,
"mask/share_step_conf": 0.1658560037612915,
"num_tokens": 26331378.0,
"reward": 0.570929765701294,
"reward_std": 0.2469976842403412,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.6339846849441528,
"rewards/format_reward_step": 0.88671875,
"rewards/step_margin_reward": 0.20396846532821655,
"step": 85
},
{
"adv/mean_abs_final_conf": 0.5568798184394836,
"adv/mean_abs_reasoning": 0.49567070603370667,
"adv/mean_abs_step_conf": 0.6302189230918884,
"adv/ratio_final_to_reasoning": 1.1234874517712867,
"adv/ratio_step_to_reasoning": 1.2714467799293998,
"adv/std_final_conf": 0.8091704845428467,
"adv/std_reasoning": 0.7755091786384583,
"adv/std_step_conf": 0.8597295880317688,
"calib/answer_extract_rate": 0.93359375,
"calib/avg_num_step_conf": 16.34375,
"calib/ece": 0.39535425383542533,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.99581589958159,
"calib/gap": -0.0038166520979023844,
"calib/mean_conf": 0.9881018131101812,
"calib/mu_c": 0.9865687645687643,
"calib/mu_w": 0.9903854166666667,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3925648535564853,
"calib/std_conf": 0.04252623935218979,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4565,
"calib/step_q_c_n": 1922.0,
"calib/step_q_gap": -0.16944783377542005,
"calib/step_q_w": 0.6259478337754201,
"calib/step_q_w_n": 2262.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2860.0,
"completions/max_terminated_length": 2860.0,
"completions/mean_length": 823.42578125,
"completions/mean_terminated_length": 878.3208618164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 301.0,
"epoch": 0.09173333333333333,
"grad_norm": 0.5175958871841431,
"kl": 0.21533203125,
"learning_rate": 3.1944444444444443e-06,
"loss": -0.2155,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.01790308579802513,
"mask/share_reasoning": 0.756227433681488,
"mask/share_step_conf": 0.16336949169635773,
"num_tokens": 26647687.0,
"reward": 0.5359945297241211,
"reward_std": 0.23279647529125214,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.5600782632827759,
"rewards/format_reward_step": 0.9296875,
"rewards/step_margin_reward": 0.21425451338291168,
"step": 86
},
{
"adv/mean_abs_final_conf": 0.5304808616638184,
"adv/mean_abs_reasoning": 0.4583980441093445,
"adv/mean_abs_step_conf": 0.6418082118034363,
"adv/ratio_final_to_reasoning": 1.1572494003427283,
"adv/ratio_step_to_reasoning": 1.4001111480535502,
"adv/std_final_conf": 0.7625007629394531,
"adv/std_reasoning": 0.7397937178611755,
"adv/std_step_conf": 0.8598397374153137,
"calib/answer_extract_rate": 0.86328125,
"calib/avg_num_step_conf": 19.32421875,
"calib/ece": 0.16364705882352948,
"calib/final_conf_rate": 0.86328125,
"calib/format_rate": 0.86328125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00048777681909739634,
"calib/mean_conf": 0.9917013574660634,
"calib/mu_c": 0.9916174863387976,
"calib/mu_w": 0.992105263157895,
"calib/nonempty_final_conf_rate": 0.86328125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16364705882352948,
"calib/std_conf": 0.0035937915248107737,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4828537754694367,
"calib/step_q_c_n": 2503.0,
"calib/step_q_gap": -0.2960459790313817,
"calib/step_q_w": 0.7788997545008184,
"calib/step_q_w_n": 2444.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.12890625,
"completions/max_length": 3005.0,
"completions/max_terminated_length": 3005.0,
"completions/mean_length": 702.1953125,
"completions/mean_terminated_length": 806.107666015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 288.0,
"epoch": 0.0928,
"grad_norm": 0.983510434627533,
"kl": 0.241546630859375,
"learning_rate": 3.1666666666666667e-06,
"loss": -0.4124,
"mask/has_final_conf_rate": 0.86328125,
"mask/share_final_conf": 0.017843790352344513,
"mask/share_reasoning": 0.6948947906494141,
"mask/share_step_conf": 0.1583552062511444,
"num_tokens": 26932945.0,
"reward": 0.6232060194015503,
"reward_std": 0.24949079751968384,
"rewards/accuracy_reward_step": 0.71484375,
"rewards/final_brier_reward_step": 0.7171169519424438,
"rewards/format_reward_step": 0.86328125,
"rewards/step_margin_reward": 0.21367013454437256,
"step": 87
},
{
"adv/mean_abs_final_conf": 0.5327442288398743,
"adv/mean_abs_reasoning": 0.4311426281929016,
"adv/mean_abs_step_conf": 0.6246463060379028,
"adv/ratio_final_to_reasoning": 1.2356565878740111,
"adv/ratio_step_to_reasoning": 1.4488159258481486,
"adv/std_final_conf": 0.7479230165481567,
"adv/std_reasoning": 0.7206476926803589,
"adv/std_step_conf": 0.8437591791152954,
"calib/answer_extract_rate": 0.9609375,
"calib/avg_num_step_conf": 15.23046875,
"calib/ece": 0.24886097154471543,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00013218293000238912,
"calib/mean_conf": 0.9927634105691057,
"calib/mu_c": 0.9927972622950819,
"calib/mu_w": 0.9926650793650795,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.24886097154471543,
"calib/std_conf": 0.004290628973701832,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4629473269519519,
"calib/step_q_c_n": 2664.0,
"calib/step_q_gap": -0.0537554260844853,
"calib/step_q_w": 0.5167027530364372,
"calib/step_q_w_n": 1235.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 1796.0,
"completions/max_terminated_length": 1796.0,
"completions/mean_length": 803.13671875,
"completions/mean_terminated_length": 835.7845458984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 357.0,
"epoch": 0.09386666666666667,
"grad_norm": 0.5124315619468689,
"kl": 0.22320556640625,
"learning_rate": 3.138888888888889e-06,
"loss": -0.0945,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.018675651401281357,
"mask/share_reasoning": 0.7699906826019287,
"mask/share_step_conf": 0.17227113246917725,
"num_tokens": 27248396.0,
"reward": 0.6558331251144409,
"reward_std": 0.2364097386598587,
"rewards/accuracy_reward_step": 0.71484375,
"rewards/final_brier_reward_step": 0.7183858752250671,
"rewards/format_reward_step": 0.9609375,
"rewards/step_margin_reward": 0.2581240236759186,
"step": 88
},
{
"adv/mean_abs_final_conf": 0.6126490235328674,
"adv/mean_abs_reasoning": 0.4717230200767517,
"adv/mean_abs_step_conf": 0.5741785764694214,
"adv/ratio_final_to_reasoning": 1.2987473527011388,
"adv/ratio_step_to_reasoning": 1.217194311136225,
"adv/std_final_conf": 0.7811753153800964,
"adv/std_reasoning": 0.7207298874855042,
"adv/std_step_conf": 0.8108007311820984,
"calib/answer_extract_rate": 0.94140625,
"calib/avg_num_step_conf": 15.31640625,
"calib/ece": 0.39068423236514527,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.002062392241379274,
"calib/mean_conf": 0.9923439834024896,
"calib/mu_c": 0.9931655172413794,
"calib/mu_w": 0.9911031250000001,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.39068423236514527,
"calib/std_conf": 0.00506345370134029,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4329008875739645,
"calib/step_q_c_n": 2028.0,
"calib/step_q_gap": -0.09478849435947445,
"calib/step_q_w": 0.527689381933439,
"calib/step_q_w_n": 1893.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2948.0,
"completions/max_terminated_length": 2948.0,
"completions/mean_length": 875.5390625,
"completions/mean_terminated_length": 914.8489379882812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 275.0,
"epoch": 0.09493333333333333,
"grad_norm": 0.6323784589767456,
"kl": 0.19537353515625,
"learning_rate": 3.1111111111111116e-06,
"loss": -0.1561,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.01836657151579857,
"mask/share_reasoning": 0.7712298631668091,
"mask/share_step_conf": 0.16743478178977966,
"num_tokens": 27581422.0,
"reward": 0.5453107357025146,
"reward_std": 0.2122250199317932,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.5729995965957642,
"rewards/format_reward_step": 0.94140625,
"rewards/step_margin_reward": 0.21605932712554932,
"step": 89
},
{
"adv/mean_abs_final_conf": 0.545224666595459,
"adv/mean_abs_reasoning": 0.4193390905857086,
"adv/mean_abs_step_conf": 0.6760151982307434,
"adv/ratio_final_to_reasoning": 1.3001999547285723,
"adv/ratio_step_to_reasoning": 1.6120967813578373,
"adv/std_final_conf": 0.745468020439148,
"adv/std_reasoning": 0.7014575600624084,
"adv/std_step_conf": 0.8911756277084351,
"calib/answer_extract_rate": 0.94140625,
"calib/avg_num_step_conf": 16.046875,
"calib/ece": 0.3187395573997232,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.991701244813278,
"calib/gap": -0.006903259951430596,
"calib/mean_conf": 0.9881724757952973,
"calib/mu_c": 0.985966869918699,
"calib/mu_w": 0.9928701298701296,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.31320705394190856,
"calib/std_conf": 0.060135680947406664,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4130003935286401,
"calib/step_q_c_n": 2287.0,
"calib/step_q_gap": -0.12124067670139232,
"calib/step_q_w": 0.5342410702300324,
"calib/step_q_w_n": 1821.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2843.0,
"completions/max_terminated_length": 2843.0,
"completions/mean_length": 819.77734375,
"completions/mean_terminated_length": 863.6337280273438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 277.0,
"epoch": 0.096,
"grad_norm": 0.7133516669273376,
"kl": 0.212066650390625,
"learning_rate": 3.0833333333333336e-06,
"loss": -0.1441,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.018856745213270187,
"mask/share_reasoning": 0.7539302110671997,
"mask/share_step_conf": 0.1764318197965622,
"num_tokens": 27894605.0,
"reward": 0.6118367910385132,
"reward_std": 0.22222277522087097,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.6413779258728027,
"rewards/format_reward_step": 0.94140625,
"rewards/step_margin_reward": 0.26588934659957886,
"step": 90
},
{
"adv/mean_abs_final_conf": 0.5682475566864014,
"adv/mean_abs_reasoning": 0.44651511311531067,
"adv/mean_abs_step_conf": 0.6393174529075623,
"adv/ratio_final_to_reasoning": 1.272627823774587,
"adv/ratio_step_to_reasoning": 1.431793536498867,
"adv/std_final_conf": 0.7608886361122131,
"adv/std_reasoning": 0.7014663219451904,
"adv/std_step_conf": 0.8439959287643433,
"calib/answer_extract_rate": 0.96484375,
"calib/avg_num_step_conf": 14.921875,
"calib/ece": 0.3113582995951417,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.9919028340080972,
"calib/gap": -0.010849006875477074,
"calib/mean_conf": 0.9835850202429149,
"calib/mu_c": 0.9802029411764706,
"calib/mu_w": 0.9910519480519476,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3033421052631579,
"calib/std_conf": 0.0881249851884996,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.3538924684370918,
"calib/step_q_c_n": 2297.0,
"calib/step_q_gap": -0.08279893011839073,
"calib/step_q_w": 0.43669139855548256,
"calib/step_q_w_n": 1523.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2270.0,
"completions/max_terminated_length": 2270.0,
"completions/mean_length": 815.0703125,
"completions/mean_terminated_length": 844.7692260742188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 356.0,
"epoch": 0.09706666666666666,
"grad_norm": 0.8096587061882019,
"kl": 0.208343505859375,
"learning_rate": 3.055555555555556e-06,
"loss": -0.1027,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.018055833876132965,
"mask/share_reasoning": 0.7757927179336548,
"mask/share_step_conf": 0.17099520564079285,
"num_tokens": 28210975.0,
"reward": 0.618476390838623,
"reward_std": 0.23961031436920166,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.6538805961608887,
"rewards/format_reward_step": 0.95703125,
"rewards/step_margin_reward": 0.2588534653186798,
"step": 91
},
{
"adv/mean_abs_final_conf": 0.47137245535850525,
"adv/mean_abs_reasoning": 0.3835318684577942,
"adv/mean_abs_step_conf": 0.6305508017539978,
"adv/ratio_final_to_reasoning": 1.2290307380555456,
"adv/ratio_step_to_reasoning": 1.64406364532231,
"adv/std_final_conf": 0.7121309041976929,
"adv/std_reasoning": 0.6816805005073547,
"adv/std_step_conf": 0.8598589301109314,
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 14.89453125,
"calib/ece": 0.21645403225806456,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.9798387096774194,
"calib/gap": 0.03951386569872939,
"calib/mean_conf": 0.9825830645161291,
"calib/mu_c": 0.9918242105263158,
"calib/mu_w": 0.9523103448275864,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21645403225806456,
"calib/std_conf": 0.07772657796961607,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3606782115065697,
"calib/step_q_c_n": 2613.0,
"calib/step_q_gap": -0.00377512182676365,
"calib/step_q_w": 0.36445333333333335,
"calib/step_q_w_n": 1200.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2696.0,
"completions/max_terminated_length": 2696.0,
"completions/mean_length": 802.26171875,
"completions/mean_terminated_length": 824.8152465820312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 306.0,
"epoch": 0.09813333333333334,
"grad_norm": 0.5272431969642639,
"kl": 0.210418701171875,
"learning_rate": 3.0277777777777776e-06,
"loss": -0.0923,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.02028692699968815,
"mask/share_reasoning": 0.7759718894958496,
"mask/share_step_conf": 0.17639735341072083,
"num_tokens": 28523074.0,
"reward": 0.6903301477432251,
"reward_std": 0.19578728079795837,
"rewards/accuracy_reward_step": 0.7421875,
"rewards/final_brier_reward_step": 0.7576503753662109,
"rewards/format_reward_step": 0.96875,
"rewards/step_margin_reward": 0.2808223366737366,
"step": 92
},
{
"adv/mean_abs_final_conf": 0.6249883770942688,
"adv/mean_abs_reasoning": 0.4877223074436188,
"adv/mean_abs_step_conf": 0.7049061059951782,
"adv/ratio_final_to_reasoning": 1.2814430825814096,
"adv/ratio_step_to_reasoning": 1.4453021632123442,
"adv/std_final_conf": 0.7965861558914185,
"adv/std_reasoning": 0.739631712436676,
"adv/std_step_conf": 0.8913946151733398,
"calib/answer_extract_rate": 0.93359375,
"calib/avg_num_step_conf": 15.8984375,
"calib/ece": 0.27326973500697344,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.9707112970711297,
"calib/gap": 0.04162226972948824,
"calib/mean_conf": 0.9731302649930265,
"calib/mu_c": 0.9854950396825398,
"calib/mu_w": 0.9438727699530516,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.27173556485355643,
"calib/std_conf": 0.11830748467313208,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3847459764985869,
"calib/step_q_c_n": 2241.0,
"calib/step_q_gap": -0.03284241424316636,
"calib/step_q_w": 0.41758839074175325,
"calib/step_q_w_n": 1829.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 2889.0,
"completions/max_terminated_length": 2889.0,
"completions/mean_length": 785.74609375,
"completions/mean_terminated_length": 834.6514892578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 240.0,
"epoch": 0.0992,
"grad_norm": 0.7500261664390564,
"kl": 0.21484375,
"learning_rate": 3e-06,
"loss": -0.1888,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.01931125298142433,
"mask/share_reasoning": 0.7512341141700745,
"mask/share_step_conf": 0.17086084187030792,
"num_tokens": 28830001.0,
"reward": 0.6308143138885498,
"reward_std": 0.28138795495033264,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.6736418008804321,
"rewards/format_reward_step": 0.93359375,
"rewards/step_margin_reward": 0.27001798152923584,
"step": 93
},
{
"adv/mean_abs_final_conf": 0.5569735765457153,
"adv/mean_abs_reasoning": 0.3944690227508545,
"adv/mean_abs_step_conf": 0.5707467794418335,
"adv/ratio_final_to_reasoning": 1.4119577062392001,
"adv/ratio_step_to_reasoning": 1.4468735097668632,
"adv/std_final_conf": 0.773460865020752,
"adv/std_reasoning": 0.681592583656311,
"adv/std_step_conf": 0.8110101819038391,
"calib/answer_extract_rate": 0.94140625,
"calib/avg_num_step_conf": 14.4765625,
"calib/ece": 0.32540532503457825,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.983402489626556,
"calib/gap": 0.009855063014403309,
"calib/mean_conf": 0.9865394882434302,
"calib/mu_c": 0.9898517708333333,
"calib/mu_w": 0.97999670781893,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3240221991701246,
"calib/std_conf": 0.061612208320076564,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.32253509980568806,
"calib/step_q_c_n": 1887.0,
"calib/step_q_gap": -0.14021302187288992,
"calib/step_q_w": 0.462748121678578,
"calib/step_q_w_n": 1819.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 2963.0,
"completions/max_terminated_length": 2963.0,
"completions/mean_length": 707.46484375,
"completions/mean_terminated_length": 748.3925170898438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 284.0,
"epoch": 0.10026666666666667,
"grad_norm": 0.6596551537513733,
"kl": 0.230255126953125,
"learning_rate": 2.9722222222222225e-06,
"loss": -0.131,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.021078769117593765,
"mask/share_reasoning": 0.7578392624855042,
"mask/share_step_conf": 0.16639444231987,
"num_tokens": 29119792.0,
"reward": 0.5926636457443237,
"reward_std": 0.20104211568832397,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.6339141130447388,
"rewards/format_reward_step": 0.9375,
"rewards/step_margin_reward": 0.23891310393810272,
"step": 94
},
{
"adv/mean_abs_final_conf": 0.4981163740158081,
"adv/mean_abs_reasoning": 0.33328384160995483,
"adv/mean_abs_step_conf": 0.6746397018432617,
"adv/ratio_final_to_reasoning": 1.4945710287351954,
"adv/ratio_step_to_reasoning": 2.0242196518869906,
"adv/std_final_conf": 0.6904189586639404,
"adv/std_reasoning": 0.6186611652374268,
"adv/std_step_conf": 0.8758255839347839,
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 15.515625,
"calib/ece": 0.2768244979919678,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9879518072289156,
"calib/gap": 0.021064312617702696,
"calib/mean_conf": 0.9876678714859438,
"calib/mu_c": 0.993758757062147,
"calib/mu_w": 0.9726944444444443,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2768244979919678,
"calib/std_conf": 0.07046846977775822,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.31826907868722815,
"calib/step_q_c_n": 2529.0,
"calib/step_q_gap": 0.05056060097875048,
"calib/step_q_w": 0.26770847770847767,
"calib/step_q_w_n": 1443.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2468.0,
"completions/max_terminated_length": 2468.0,
"completions/mean_length": 846.015625,
"completions/mean_terminated_length": 869.7991943359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 291.0,
"epoch": 0.10133333333333333,
"grad_norm": 0.6777894496917725,
"kl": 0.204498291015625,
"learning_rate": 2.944444444444445e-06,
"loss": -0.116,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.01932145655155182,
"mask/share_reasoning": 0.7791246771812439,
"mask/share_step_conf": 0.1742100864648819,
"num_tokens": 29442500.0,
"reward": 0.6396147608757019,
"reward_std": 0.21907824277877808,
"rewards/accuracy_reward_step": 0.69140625,
"rewards/final_brier_reward_step": 0.7017877101898193,
"rewards/format_reward_step": 0.97265625,
"rewards/step_margin_reward": 0.24462932348251343,
"step": 95
},
{
"adv/mean_abs_final_conf": 0.4814407229423523,
"adv/mean_abs_reasoning": 0.34680551290512085,
"adv/mean_abs_step_conf": 0.6026292443275452,
"adv/ratio_final_to_reasoning": 1.388215310966135,
"adv/ratio_step_to_reasoning": 1.7376576262569754,
"adv/std_final_conf": 0.7257750034332275,
"adv/std_reasoning": 0.6612364649772644,
"adv/std_step_conf": 0.8277370929718018,
"calib/answer_extract_rate": 0.95703125,
"calib/avg_num_step_conf": 14.5546875,
"calib/ece": 0.21818557823129242,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.9551020408163265,
"calib/gap": 0.04885828924162239,
"calib/mean_conf": 0.9678454421768707,
"calib/mu_c": 0.9790130511463844,
"calib/mu_w": 0.930154761904762,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20730122448979582,
"calib/std_conf": 0.14024968713784997,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.37112126843657817,
"calib/step_q_c_n": 2260.0,
"calib/step_q_gap": -0.0931903050058957,
"calib/step_q_w": 0.46431157344247387,
"calib/step_q_w_n": 1466.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2812.0,
"completions/max_terminated_length": 2812.0,
"completions/mean_length": 712.390625,
"completions/mean_terminated_length": 741.3495483398438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 233.0,
"epoch": 0.1024,
"grad_norm": 1.1299368143081665,
"kl": 0.2353515625,
"learning_rate": 2.916666666666667e-06,
"loss": -0.0482,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.02144728973507881,
"mask/share_reasoning": 0.761890172958374,
"mask/share_step_conf": 0.17759999632835388,
"num_tokens": 29730688.0,
"reward": 0.6556459665298462,
"reward_std": 0.2049870640039444,
"rewards/accuracy_reward_step": 0.73828125,
"rewards/final_brier_reward_step": 0.7490242719650269,
"rewards/format_reward_step": 0.95703125,
"rewards/step_margin_reward": 0.223205104470253,
"step": 96
},
{
"adv/mean_abs_final_conf": 0.6165008544921875,
"adv/mean_abs_reasoning": 0.46222391724586487,
"adv/mean_abs_step_conf": 0.6652477383613586,
"adv/ratio_final_to_reasoning": 1.3337709960262831,
"adv/ratio_step_to_reasoning": 1.4392326176568269,
"adv/std_final_conf": 0.7992217540740967,
"adv/std_reasoning": 0.7207860946655273,
"adv/std_step_conf": 0.8758837580680847,
"calib/answer_extract_rate": 0.9375,
"calib/avg_num_step_conf": 14.47265625,
"calib/ece": 0.310992638888889,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.9541666666666667,
"calib/gap": 0.04828318001939369,
"calib/mean_conf": 0.9623815277777779,
"calib/mu_c": 0.9782747412008281,
"calib/mu_w": 0.9299915611814344,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.30127041666666676,
"calib/std_conf": 0.15611747531455863,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4241068054850178,
"calib/step_q_c_n": 1969.0,
"calib/step_q_gap": 0.025339830638627603,
"calib/step_q_w": 0.3987669748463902,
"calib/step_q_w_n": 1736.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 2502.0,
"completions/max_terminated_length": 2502.0,
"completions/mean_length": 701.59765625,
"completions/mean_terminated_length": 745.265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 214.0,
"epoch": 0.10346666666666667,
"grad_norm": 1.2831897735595703,
"kl": 0.230224609375,
"learning_rate": 2.888888888888889e-06,
"loss": -0.1498,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.02083716168999672,
"mask/share_reasoning": 0.7508844137191772,
"mask/share_step_conf": 0.16968463361263275,
"num_tokens": 30015369.0,
"reward": 0.647125244140625,
"reward_std": 0.24315395951271057,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.6479384899139404,
"rewards/format_reward_step": 0.9375,
"rewards/step_margin_reward": 0.3330307602882385,
"step": 97
},
{
"adv/mean_abs_final_conf": 0.5435746908187866,
"adv/mean_abs_reasoning": 0.4460451602935791,
"adv/mean_abs_step_conf": 0.6659544706344604,
"adv/ratio_final_to_reasoning": 1.2186539373300571,
"adv/ratio_step_to_reasoning": 1.493020281166465,
"adv/std_final_conf": 0.7484281063079834,
"adv/std_reasoning": 0.7395012974739075,
"adv/std_step_conf": 0.875845730304718,
"calib/answer_extract_rate": 0.94921875,
"calib/avg_num_step_conf": 14.78125,
"calib/ece": 0.325366803840878,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.9753086419753086,
"calib/gap": 0.03632447791164661,
"calib/mean_conf": 0.9715478737997257,
"calib/mu_c": 0.9839549999999999,
"calib/mu_w": 0.9476305220883533,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3192392318244171,
"calib/std_conf": 0.1361281358591226,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4082570292620865,
"calib/step_q_c_n": 2096.0,
"calib/step_q_gap": -0.06560868953728155,
"calib/step_q_w": 0.47386571879936806,
"calib/step_q_w_n": 1688.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2882.0,
"completions/max_terminated_length": 2882.0,
"completions/mean_length": 749.0703125,
"completions/mean_terminated_length": 782.7020263671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 241.0,
"epoch": 0.10453333333333334,
"grad_norm": 0.5426382422447205,
"kl": 0.229400634765625,
"learning_rate": 2.861111111111111e-06,
"loss": -0.1304,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.020803090184926987,
"mask/share_reasoning": 0.7624001502990723,
"mask/share_step_conf": 0.17382797598838806,
"num_tokens": 30313315.0,
"reward": 0.5548592805862427,
"reward_std": 0.24476754665374756,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.6406000852584839,
"rewards/format_reward_step": 0.94921875,
"rewards/step_margin_reward": 0.15427470207214355,
"step": 98
},
{
"adv/mean_abs_final_conf": 0.6283714771270752,
"adv/mean_abs_reasoning": 0.49294739961624146,
"adv/mean_abs_step_conf": 0.5275165438652039,
"adv/ratio_final_to_reasoning": 1.2747231806400867,
"adv/ratio_step_to_reasoning": 1.0701274502632012,
"adv/std_final_conf": 0.8293971419334412,
"adv/std_reasoning": 0.7396363019943237,
"adv/std_step_conf": 0.7765645384788513,
"calib/answer_extract_rate": 0.89453125,
"calib/avg_num_step_conf": 17.609375,
"calib/ece": 0.427187768558952,
"calib/final_conf_rate": 0.89453125,
"calib/format_rate": 0.890625,
"calib/frac_conf_gt_0.9": 0.9082969432314411,
"calib/gap": 0.002509178710178861,
"calib/mean_conf": 0.9253537074235808,
"calib/mu_c": 0.9264384615384617,
"calib/mu_w": 0.9239292828282828,
"calib/nonempty_final_conf_rate": 0.89453125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3924279432314411,
"calib/std_conf": 0.23150664868316992,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4221572494669509,
"calib/step_q_c_n": 1876.0,
"calib/step_q_gap": -0.018625883131833276,
"calib/step_q_w": 0.44078313259878416,
"calib/step_q_w_n": 2632.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09765625,
"completions/max_length": 3029.0,
"completions/max_terminated_length": 3029.0,
"completions/mean_length": 838.328125,
"completions/mean_terminated_length": 929.0562744140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 249.0,
"epoch": 0.1056,
"grad_norm": 0.6395625472068787,
"kl": 0.195220947265625,
"learning_rate": 2.8333333333333335e-06,
"loss": -0.2201,
"mask/has_final_conf_rate": 0.89453125,
"mask/share_final_conf": 0.017325591295957565,
"mask/share_reasoning": 0.7251094579696655,
"mask/share_step_conf": 0.15990865230560303,
"num_tokens": 30633727.0,
"reward": 0.4642782509326935,
"reward_std": 0.25697386264801025,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.5137213468551636,
"rewards/format_reward_step": 0.890625,
"rewards/step_margin_reward": 0.13514766097068787,
"step": 99
},
{
"adv/mean_abs_final_conf": 0.6297253966331482,
"adv/mean_abs_reasoning": 0.46328994631767273,
"adv/mean_abs_step_conf": 0.6763242483139038,
"adv/ratio_final_to_reasoning": 1.3592468423680244,
"adv/ratio_step_to_reasoning": 1.4598293221976284,
"adv/std_final_conf": 0.8469195365905762,
"adv/std_reasoning": 0.7577730417251587,
"adv/std_step_conf": 0.9062535762786865,
"calib/answer_extract_rate": 0.921875,
"calib/avg_num_step_conf": 16.05078125,
"calib/ece": 0.249825,
"calib/final_conf_rate": 0.921875,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.8983050847457628,
"calib/gap": 0.1654154810298103,
"calib/mean_conf": 0.9206927966101696,
"calib/mu_c": 0.9711585365853659,
"calib/mu_w": 0.8057430555555556,
"calib/nonempty_final_conf_rate": 0.921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.23780127118644068,
"calib/std_conf": 0.23762598061704632,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42712190565156466,
"calib/step_q_c_n": 2141.0,
"calib/step_q_gap": -0.005995872549654835,
"calib/step_q_w": 0.4331177782012195,
"calib/step_q_w_n": 1968.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 2555.0,
"completions/max_terminated_length": 2555.0,
"completions/mean_length": 780.60546875,
"completions/mean_terminated_length": 843.1856079101562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 275.0,
"epoch": 0.10666666666666667,
"grad_norm": 0.790928304195404,
"kl": 0.2076416015625,
"learning_rate": 2.805555555555556e-06,
"loss": -0.2755,
"mask/has_final_conf_rate": 0.921875,
"mask/share_final_conf": 0.018736468628048897,
"mask/share_reasoning": 0.7489428520202637,
"mask/share_step_conf": 0.1581019014120102,
"num_tokens": 30940970.0,
"reward": 0.617317795753479,
"reward_std": 0.25454002618789673,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.6920415759086609,
"rewards/format_reward_step": 0.921875,
"rewards/step_margin_reward": 0.23009398579597473,
"step": 100
},
{
"adv/mean_abs_final_conf": 0.6387467384338379,
"adv/mean_abs_reasoning": 0.5353177785873413,
"adv/mean_abs_step_conf": 0.6979807615280151,
"adv/ratio_final_to_reasoning": 1.1932103957380922,
"adv/ratio_step_to_reasoning": 1.3038624709418913,
"adv/std_final_conf": 0.8217288255691528,
"adv/std_reasoning": 0.775704562664032,
"adv/std_step_conf": 0.8914257884025574,
"calib/answer_extract_rate": 0.91796875,
"calib/avg_num_step_conf": 17.5625,
"calib/ece": 0.3170382978723405,
"calib/final_conf_rate": 0.91796875,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.8936170212765957,
"calib/gap": 0.1124597315436241,
"calib/mean_conf": 0.9138042553191488,
"calib/mu_c": 0.9549597315436241,
"calib/mu_w": 0.8425,
"calib/nonempty_final_conf_rate": 0.91796875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.29840000000000005,
"calib/std_conf": 0.24286319584652155,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.39851011029411765,
"calib/step_q_c_n": 2176.0,
"calib/step_q_gap": -0.026448510395537528,
"calib/step_q_w": 0.4249586206896552,
"calib/step_q_w_n": 2320.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 2930.0,
"completions/max_terminated_length": 2930.0,
"completions/mean_length": 806.91015625,
"completions/mean_terminated_length": 871.59912109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 341.0,
"epoch": 0.10773333333333333,
"grad_norm": 0.9349942803382874,
"kl": 0.204559326171875,
"learning_rate": 2.7777777777777783e-06,
"loss": -0.1962,
"mask/has_final_conf_rate": 0.91796875,
"mask/share_final_conf": 0.017822718247771263,
"mask/share_reasoning": 0.7434788942337036,
"mask/share_step_conf": 0.16447967290878296,
"num_tokens": 31254531.0,
"reward": 0.5653898119926453,
"reward_std": 0.2803936004638672,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.6268872022628784,
"rewards/format_reward_step": 0.91796875,
"rewards/step_margin_reward": 0.20389243960380554,
"step": 101
},
{
"adv/mean_abs_final_conf": 0.6121485829353333,
"adv/mean_abs_reasoning": 0.4123254418373108,
"adv/mean_abs_step_conf": 0.5459374189376831,
"adv/ratio_final_to_reasoning": 1.484624815310004,
"adv/ratio_step_to_reasoning": 1.3240449497974247,
"adv/std_final_conf": 0.8249315023422241,
"adv/std_reasoning": 0.6818413138389587,
"adv/std_step_conf": 0.7938175797462463,
"calib/answer_extract_rate": 0.9375,
"calib/avg_num_step_conf": 14.37890625,
"calib/ece": 0.29191631799163176,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.9121338912133892,
"calib/gap": 0.05567853134519829,
"calib/mean_conf": 0.9319999999999999,
"calib/mu_c": 0.9499382716049384,
"calib/mu_w": 0.8942597402597401,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.27304602510460246,
"calib/std_conf": 0.20579792664378005,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.42031159793814427,
"calib/step_q_c_n": 1940.0,
"calib/step_q_gap": 0.06465438943728269,
"calib/step_q_w": 0.3556572085008616,
"calib/step_q_w_n": 1741.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 2948.0,
"completions/max_terminated_length": 2948.0,
"completions/mean_length": 679.98046875,
"completions/mean_terminated_length": 722.3029174804688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 200.0,
"epoch": 0.1088,
"grad_norm": 0.8080055713653564,
"kl": 0.25323486328125,
"learning_rate": 2.7500000000000004e-06,
"loss": -0.1598,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.022250540554523468,
"mask/share_reasoning": 0.7478595972061157,
"mask/share_step_conf": 0.1712961196899414,
"num_tokens": 31535302.0,
"reward": 0.5693697929382324,
"reward_std": 0.23744550347328186,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.6525646448135376,
"rewards/format_reward_step": 0.93359375,
"rewards/step_margin_reward": 0.17211249470710754,
"step": 102
},
{
"adv/mean_abs_final_conf": 0.5998444557189941,
"adv/mean_abs_reasoning": 0.40662068128585815,
"adv/mean_abs_step_conf": 0.6629431247711182,
"adv/ratio_final_to_reasoning": 1.4751941633221992,
"adv/ratio_step_to_reasoning": 1.6303723722922567,
"adv/std_final_conf": 0.8152146339416504,
"adv/std_reasoning": 0.7015076279640198,
"adv/std_step_conf": 0.8602816462516785,
"calib/answer_extract_rate": 0.921875,
"calib/avg_num_step_conf": 16.20703125,
"calib/ece": 0.2672309322033899,
"calib/final_conf_rate": 0.921875,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.885593220338983,
"calib/gap": 0.1465941734417343,
"calib/mean_conf": 0.8972733050847458,
"calib/mu_c": 0.9419969512195122,
"calib/mu_w": 0.7954027777777779,
"calib/nonempty_final_conf_rate": 0.921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.23479449152542375,
"calib/std_conf": 0.27585804982234663,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4018286864490603,
"calib/step_q_c_n": 2022.0,
"calib/step_q_gap": 0.074487830783804,
"calib/step_q_w": 0.3273408556652563,
"calib/step_q_w_n": 2127.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2679.0,
"completions/max_terminated_length": 2679.0,
"completions/mean_length": 852.88671875,
"completions/mean_terminated_length": 909.7459106445312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 281.0,
"epoch": 0.10986666666666667,
"grad_norm": 1.186505675315857,
"kl": 0.198944091796875,
"learning_rate": 2.7222222222222224e-06,
"loss": -0.1797,
"mask/has_final_conf_rate": 0.921875,
"mask/share_final_conf": 0.018817514181137085,
"mask/share_reasoning": 0.7602624893188477,
"mask/share_step_conf": 0.15841998159885406,
"num_tokens": 31858193.0,
"reward": 0.5926206111907959,
"reward_std": 0.25742650032043457,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.675830066204071,
"rewards/format_reward_step": 0.921875,
"rewards/step_margin_reward": 0.19691118597984314,
"step": 103
},
{
"adv/mean_abs_final_conf": 0.6392145156860352,
"adv/mean_abs_reasoning": 0.47575998306274414,
"adv/mean_abs_step_conf": 0.6251335144042969,
"adv/ratio_final_to_reasoning": 1.3435651135916034,
"adv/ratio_step_to_reasoning": 1.313968254286433,
"adv/std_final_conf": 0.8539537787437439,
"adv/std_reasoning": 0.7577002048492432,
"adv/std_step_conf": 0.8432413339614868,
"calib/answer_extract_rate": 0.93359375,
"calib/avg_num_step_conf": 15.88671875,
"calib/ece": 0.34682083333333336,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.825,
"calib/gap": 0.14368683529749926,
"calib/mean_conf": 0.8533208333333333,
"calib/mu_c": 0.9119929577464788,
"calib/mu_w": 0.7683061224489796,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.30423750000000005,
"calib/std_conf": 0.3230028785722694,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.47077692307692304,
"calib/step_q_c_n": 1820.0,
"calib/step_q_gap": 0.0288770565882715,
"calib/step_q_w": 0.44189986648865154,
"calib/step_q_w_n": 2247.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2956.0,
"completions/max_terminated_length": 2956.0,
"completions/mean_length": 754.546875,
"completions/mean_terminated_length": 804.8500366210938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 246.0,
"epoch": 0.11093333333333333,
"grad_norm": 1.338552713394165,
"kl": 0.223480224609375,
"learning_rate": 2.6944444444444444e-06,
"loss": -0.2012,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.0200947392731905,
"mask/share_reasoning": 0.7502679228782654,
"mask/share_step_conf": 0.16713735461235046,
"num_tokens": 32158037.0,
"reward": 0.574633777141571,
"reward_std": 0.2656676769256592,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.6113731861114502,
"rewards/format_reward_step": 0.93359375,
"rewards/step_margin_reward": 0.2402380108833313,
"step": 104
},
{
"adv/mean_abs_final_conf": 0.5889736413955688,
"adv/mean_abs_reasoning": 0.480373740196228,
"adv/mean_abs_step_conf": 0.5982500314712524,
"adv/ratio_final_to_reasoning": 1.2260737673857418,
"adv/ratio_step_to_reasoning": 1.2453845441819387,
"adv/std_final_conf": 0.7930524349212646,
"adv/std_reasoning": 0.7577711939811707,
"adv/std_step_conf": 0.8277463316917419,
"calib/answer_extract_rate": 0.91796875,
"calib/avg_num_step_conf": 16.32421875,
"calib/ece": 0.21246808510638282,
"calib/final_conf_rate": 0.91796875,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.9106382978723404,
"calib/gap": 0.13089104291934484,
"calib/mean_conf": 0.9232765957446807,
"calib/mu_c": 0.9527967032967033,
"calib/mu_w": 0.8219056603773585,
"calib/nonempty_final_conf_rate": 0.91796875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18063829787234026,
"calib/std_conf": 0.24153592765023144,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.410578872614292,
"calib/step_q_c_n": 2253.0,
"calib/step_q_gap": -0.012402866741886565,
"calib/step_q_w": 0.42298173935617855,
"calib/step_q_w_n": 1926.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2728.0,
"completions/max_terminated_length": 2728.0,
"completions/mean_length": 747.4609375,
"completions/mean_terminated_length": 810.8051147460938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 262.0,
"epoch": 0.112,
"grad_norm": 1.060556411743164,
"kl": 0.228790283203125,
"learning_rate": 2.666666666666667e-06,
"loss": -0.2401,
"mask/has_final_conf_rate": 0.91796875,
"mask/share_final_conf": 0.019843213260173798,
"mask/share_reasoning": 0.7385556697845459,
"mask/share_step_conf": 0.16347616910934448,
"num_tokens": 32455147.0,
"reward": 0.6941068172454834,
"reward_std": 0.25616464018821716,
"rewards/accuracy_reward_step": 0.7109375,
"rewards/final_brier_reward_step": 0.7257221341133118,
"rewards/format_reward_step": 0.91796875,
"rewards/step_margin_reward": 0.33671021461486816,
"step": 105
},
{
"adv/mean_abs_final_conf": 0.5451048612594604,
"adv/mean_abs_reasoning": 0.41657865047454834,
"adv/mean_abs_step_conf": 0.614454984664917,
"adv/ratio_final_to_reasoning": 1.3085280790038103,
"adv/ratio_step_to_reasoning": 1.4750035412639522,
"adv/std_final_conf": 0.7802751064300537,
"adv/std_reasoning": 0.7014001607894897,
"adv/std_step_conf": 0.8601456880569458,
"calib/answer_extract_rate": 0.96484375,
"calib/avg_num_step_conf": 14.15234375,
"calib/ece": 0.31559109311740885,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.9068825910931174,
"calib/gap": 0.1770145704467352,
"calib/mean_conf": 0.9144574898785426,
"calib/mu_c": 0.9839733333333334,
"calib/mu_w": 0.8069587628865982,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.31138056680161935,
"calib/std_conf": 0.2604006668620519,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42816789297658864,
"calib/step_q_c_n": 1794.0,
"calib/step_q_gap": -0.026669668532432633,
"calib/step_q_w": 0.45483756150902127,
"calib/step_q_w_n": 1829.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2030.0,
"completions/max_terminated_length": 2030.0,
"completions/mean_length": 743.984375,
"completions/mean_terminated_length": 771.0931396484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 337.0,
"epoch": 0.11306666666666666,
"grad_norm": 0.9969174861907959,
"kl": 0.2369384765625,
"learning_rate": 2.6388888888888893e-06,
"loss": -0.0807,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.020107656717300415,
"mask/share_reasoning": 0.7740259170532227,
"mask/share_step_conf": 0.17071017622947693,
"num_tokens": 32750191.0,
"reward": 0.5974287986755371,
"reward_std": 0.22962352633476257,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6597416400909424,
"rewards/format_reward_step": 0.96484375,
"rewards/step_margin_reward": 0.22495976090431213,
"step": 106
},
{
"adv/mean_abs_final_conf": 0.5846522450447083,
"adv/mean_abs_reasoning": 0.41004979610443115,
"adv/mean_abs_step_conf": 0.6936636567115784,
"adv/ratio_final_to_reasoning": 1.4258079155240198,
"adv/ratio_step_to_reasoning": 1.6916571189683427,
"adv/std_final_conf": 0.7986034154891968,
"adv/std_reasoning": 0.7014772295951843,
"adv/std_step_conf": 0.891394317150116,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 14.63671875,
"calib/ece": 0.26622419999999986,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.896,
"calib/gap": 0.1674748723073506,
"calib/mean_conf": 0.9120718000000001,
"calib/mu_c": 0.9649938596491228,
"calib/mu_w": 0.7975189873417722,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.24714799999999987,
"calib/std_conf": 0.2620013802535399,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3906449145299145,
"calib/step_q_c_n": 2340.0,
"calib/step_q_gap": 0.0008439166681896149,
"calib/step_q_w": 0.3898009978617249,
"calib/step_q_w_n": 1403.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2938.0,
"completions/max_terminated_length": 2938.0,
"completions/mean_length": 765.4609375,
"completions/mean_terminated_length": 783.83203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 224.0,
"epoch": 0.11413333333333334,
"grad_norm": 0.9824727773666382,
"kl": 0.240814208984375,
"learning_rate": 2.6111111111111113e-06,
"loss": -0.0456,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.020465373992919922,
"mask/share_reasoning": 0.7757344245910645,
"mask/share_step_conf": 0.18036270141601562,
"num_tokens": 33050765.0,
"reward": 0.6862397193908691,
"reward_std": 0.25337162613868713,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.7182737588882446,
"rewards/format_reward_step": 0.97265625,
"rewards/step_margin_reward": 0.3260806202888489,
"step": 107
},
{
"adv/mean_abs_final_conf": 0.4626810848712921,
"adv/mean_abs_reasoning": 0.30211400985717773,
"adv/mean_abs_step_conf": 0.5840635299682617,
"adv/ratio_final_to_reasoning": 1.5314784146886182,
"adv/ratio_step_to_reasoning": 1.9332553635773913,
"adv/std_final_conf": 0.724582314491272,
"adv/std_reasoning": 0.6183889508247375,
"adv/std_step_conf": 0.8277202248573303,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 14.48046875,
"calib/ece": 0.2529601593625499,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9083665338645418,
"calib/gap": 0.028346978021977942,
"calib/mean_conf": 0.9171832669322709,
"calib/mu_c": 0.9235076923076924,
"calib/mu_w": 0.8951607142857144,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19662549800796822,
"calib/std_conf": 0.2634861712294742,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.41951341695172933,
"calib/step_q_c_n": 2631.0,
"calib/step_q_gap": 0.028675312862510016,
"calib/step_q_w": 0.3908381040892193,
"calib/step_q_w_n": 1076.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 3067.0,
"completions/max_terminated_length": 3067.0,
"completions/mean_length": 792.234375,
"completions/mean_terminated_length": 808.0159301757812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 204.0,
"epoch": 0.1152,
"grad_norm": 0.8382714986801147,
"kl": 0.227691650390625,
"learning_rate": 2.5833333333333337e-06,
"loss": 0.0008,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.020667005330324173,
"mask/share_reasoning": 0.7799920439720154,
"mask/share_step_conf": 0.17980965971946716,
"num_tokens": 33356809.0,
"reward": 0.653134286403656,
"reward_std": 0.1993504762649536,
"rewards/accuracy_reward_step": 0.76171875,
"rewards/final_brier_reward_step": 0.7327922582626343,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.22503873705863953,
"step": 108
},
{
"adv/mean_abs_final_conf": 0.4827290177345276,
"adv/mean_abs_reasoning": 0.44099271297454834,
"adv/mean_abs_step_conf": 0.5236935615539551,
"adv/ratio_final_to_reasoning": 1.094641710695088,
"adv/ratio_step_to_reasoning": 1.1875333676640134,
"adv/std_final_conf": 0.7577349543571472,
"adv/std_reasoning": 0.7205712795257568,
"adv/std_step_conf": 0.7936491370201111,
"calib/answer_extract_rate": 0.9140625,
"calib/avg_num_step_conf": 15.640625,
"calib/ece": 0.42427350427350435,
"calib/final_conf_rate": 0.9140625,
"calib/format_rate": 0.9140625,
"calib/frac_conf_gt_0.9": 0.9401709401709402,
"calib/gap": 0.0813558201058201,
"calib/mean_conf": 0.945982905982906,
"calib/mu_c": 0.9835317460317461,
"calib/mu_w": 0.902175925925926,
"calib/nonempty_final_conf_rate": 0.9140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.41589743589743594,
"calib/std_conf": 0.21411639070286898,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4091853107344633,
"calib/step_q_c_n": 1593.0,
"calib/step_q_gap": -0.013271885449692655,
"calib/step_q_w": 0.42245719618415595,
"calib/step_q_w_n": 2411.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2806.0,
"completions/max_terminated_length": 2806.0,
"completions/mean_length": 723.90625,
"completions/mean_terminated_length": 785.2542114257812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 274.0,
"epoch": 0.11626666666666667,
"grad_norm": 0.6603429913520813,
"kl": 0.238128662109375,
"learning_rate": 2.5555555555555557e-06,
"loss": -0.112,
"mask/has_final_conf_rate": 0.9140625,
"mask/share_final_conf": 0.01936577633023262,
"mask/share_reasoning": 0.7345376014709473,
"mask/share_step_conf": 0.1679716408252716,
"num_tokens": 33646729.0,
"reward": 0.4862062335014343,
"reward_std": 0.20336128771305084,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.5301535129547119,
"rewards/format_reward_step": 0.9140625,
"rewards/step_margin_reward": 0.16100899875164032,
"step": 109
},
{
"adv/mean_abs_final_conf": 0.48091477155685425,
"adv/mean_abs_reasoning": 0.40880656242370605,
"adv/mean_abs_step_conf": 0.711048424243927,
"adv/ratio_final_to_reasoning": 1.176387112539578,
"adv/ratio_step_to_reasoning": 1.7393273239752045,
"adv/std_final_conf": 0.7066504955291748,
"adv/std_reasoning": 0.681603729724884,
"adv/std_step_conf": 0.9066959023475647,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 12.2734375,
"calib/ece": 0.2816929133858269,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.937007874015748,
"calib/gap": 0.08334684684684668,
"calib/mean_conf": 0.9355511811023624,
"calib/mu_c": 0.9598333333333332,
"calib/mu_w": 0.8764864864864865,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.25429133858267733,
"calib/std_conf": 0.2419586484050014,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4007450095057034,
"calib/step_q_c_n": 2104.0,
"calib/step_q_gap": -0.028012697623390925,
"calib/step_q_w": 0.42875770712909433,
"calib/step_q_w_n": 1038.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1923.0,
"completions/max_terminated_length": 1923.0,
"completions/mean_length": 725.7421875,
"completions/mean_terminated_length": 731.4566650390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 223.0,
"epoch": 0.11733333333333333,
"grad_norm": 0.5370349884033203,
"kl": 0.266632080078125,
"learning_rate": 2.5277777777777778e-06,
"loss": -0.005,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.022184470668435097,
"mask/share_reasoning": 0.7960182428359985,
"mask/share_step_conf": 0.17398479580879211,
"num_tokens": 33937439.0,
"reward": 0.6677199602127075,
"reward_std": 0.2457650750875473,
"rewards/accuracy_reward_step": 0.703125,
"rewards/final_brier_reward_step": 0.7123234272003174,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.2840539813041687,
"step": 110
},
{
"adv/mean_abs_final_conf": 0.45591533184051514,
"adv/mean_abs_reasoning": 0.48294445872306824,
"adv/mean_abs_step_conf": 0.6763150691986084,
"adv/ratio_final_to_reasoning": 0.9440326389622119,
"adv/ratio_step_to_reasoning": 1.400399273628323,
"adv/std_final_conf": 0.7039382457733154,
"adv/std_reasoning": 0.7393079400062561,
"adv/std_step_conf": 0.8756763339042664,
"calib/answer_extract_rate": 0.95703125,
"calib/avg_num_step_conf": 14.30078125,
"calib/ece": 0.24377551020408156,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.9755102040816327,
"calib/gap": 0.054459721487749024,
"calib/mean_conf": 0.9743877551020408,
"calib/mu_c": 0.9881693989071039,
"calib/mu_w": 0.9337096774193548,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.23561224489795912,
"calib/std_conf": 0.15442068386445038,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3812925341296929,
"calib/step_q_c_n": 2344.0,
"calib/step_q_gap": 0.0034103017834514193,
"calib/step_q_w": 0.37788223234624146,
"calib/step_q_w_n": 1317.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2239.0,
"completions/max_terminated_length": 2239.0,
"completions/mean_length": 751.015625,
"completions/mean_terminated_length": 784.7346801757812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 269.0,
"epoch": 0.1184,
"grad_norm": 0.4570012390613556,
"kl": 0.243682861328125,
"learning_rate": 2.5e-06,
"loss": -0.0591,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.020198732614517212,
"mask/share_reasoning": 0.7622815370559692,
"mask/share_step_conf": 0.17455099523067474,
"num_tokens": 34237107.0,
"reward": 0.6729134321212769,
"reward_std": 0.25619810819625854,
"rewards/accuracy_reward_step": 0.71484375,
"rewards/final_brier_reward_step": 0.7235041856765747,
"rewards/format_reward_step": 0.95703125,
"rewards/step_margin_reward": 0.2879476547241211,
"step": 111
},
{
"adv/mean_abs_final_conf": 0.5636082291603088,
"adv/mean_abs_reasoning": 0.42544421553611755,
"adv/mean_abs_step_conf": 0.5213595628738403,
"adv/ratio_final_to_reasoning": 1.324752361364429,
"adv/ratio_step_to_reasoning": 1.2254475295118454,
"adv/std_final_conf": 0.791305661201477,
"adv/std_reasoning": 0.7014791369438171,
"adv/std_step_conf": 0.7762733697891235,
"calib/answer_extract_rate": 0.953125,
"calib/avg_num_step_conf": 15.14453125,
"calib/ece": 0.34247950819672135,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.9549180327868853,
"calib/gap": 0.023449449973808267,
"calib/mean_conf": 0.9535450819672132,
"calib/mu_c": 0.9615217391304348,
"calib/mu_w": 0.9380722891566265,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.31809426229508203,
"calib/std_conf": 0.20625821895296445,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.33513364183248495,
"calib/step_q_c_n": 2161.0,
"calib/step_q_gap": -0.007010239286396158,
"calib/step_q_w": 0.3421438811188811,
"calib/step_q_w_n": 1716.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2151.0,
"completions/max_terminated_length": 2151.0,
"completions/mean_length": 809.3046875,
"completions/mean_terminated_length": 849.1065063476562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 363.0,
"epoch": 0.11946666666666667,
"grad_norm": 1.2874290943145752,
"kl": 0.214996337890625,
"learning_rate": 2.4722222222222226e-06,
"loss": -0.0949,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.017914071679115295,
"mask/share_reasoning": 0.7694779634475708,
"mask/share_step_conf": 0.16573293507099152,
"num_tokens": 34552209.0,
"reward": 0.5713348388671875,
"reward_std": 0.23633083701133728,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.6264573335647583,
"rewards/format_reward_step": 0.953125,
"rewards/step_margin_reward": 0.19980597496032715,
"step": 112
},
{
"adv/mean_abs_final_conf": 0.5868669152259827,
"adv/mean_abs_reasoning": 0.4290392994880676,
"adv/mean_abs_step_conf": 0.7190387845039368,
"adv/ratio_final_to_reasoning": 1.3678628412973728,
"adv/ratio_step_to_reasoning": 1.675927555731837,
"adv/std_final_conf": 0.7838603258132935,
"adv/std_reasoning": 0.701424777507782,
"adv/std_step_conf": 0.9065331816673279,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 13.671875,
"calib/ece": 0.29174206349206355,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9246031746031746,
"calib/gap": 0.19411820652173917,
"calib/mean_conf": 0.9266626984126984,
"calib/mu_c": 0.9975312500000001,
"calib/mu_w": 0.8034130434782609,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.29174206349206355,
"calib/std_conf": 0.25170808811888545,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.3293279332351497,
"calib/step_q_c_n": 2037.0,
"calib/step_q_gap": 0.021470790378006865,
"calib/step_q_w": 0.30785714285714283,
"calib/step_q_w_n": 1463.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1638.0,
"completions/max_terminated_length": 1638.0,
"completions/mean_length": 742.3125,
"completions/mean_terminated_length": 754.0952758789062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 286.0,
"epoch": 0.12053333333333334,
"grad_norm": 0.7477316856384277,
"kl": 0.2469482421875,
"learning_rate": 2.4444444444444447e-06,
"loss": -0.0327,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.02127990871667862,
"mask/share_reasoning": 0.7831719517707825,
"mask/share_step_conf": 0.17992308735847473,
"num_tokens": 34847441.0,
"reward": 0.6346909999847412,
"reward_std": 0.2597951292991638,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.6986355781555176,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.24887137115001678,
"step": 113
},
{
"adv/mean_abs_final_conf": 0.3718037009239197,
"adv/mean_abs_reasoning": 0.30481839179992676,
"adv/mean_abs_step_conf": 0.6702938675880432,
"adv/ratio_final_to_reasoning": 1.2197548144272081,
"adv/ratio_step_to_reasoning": 2.1989941736455427,
"adv/std_final_conf": 0.6294357776641846,
"adv/std_reasoning": 0.6184821128845215,
"adv/std_step_conf": 0.8755671977996826,
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 14.2109375,
"calib/ece": 0.17660887096774197,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.9395161290322581,
"calib/gap": 0.19603357522980136,
"calib/mean_conf": 0.9428185483870969,
"calib/mu_c": 0.9847128205128205,
"calib/mu_w": 0.7886792452830191,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1665685483870968,
"calib/std_conf": 0.22223807376359006,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3469958506224066,
"calib/step_q_c_n": 2410.0,
"calib/step_q_gap": 0.03685334247908417,
"calib/step_q_w": 0.31014250814332245,
"calib/step_q_w_n": 1228.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2654.0,
"completions/max_terminated_length": 2654.0,
"completions/mean_length": 763.73046875,
"completions/mean_terminated_length": 788.3668823242188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 255.0,
"epoch": 0.1216,
"grad_norm": 0.5136744976043701,
"kl": 0.23712158203125,
"learning_rate": 2.4166666666666667e-06,
"loss": -0.0732,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.021277498453855515,
"mask/share_reasoning": 0.7698897123336792,
"mask/share_step_conf": 0.17758281528949738,
"num_tokens": 35147980.0,
"reward": 0.6923535466194153,
"reward_std": 0.19236746430397034,
"rewards/accuracy_reward_step": 0.76171875,
"rewards/final_brier_reward_step": 0.7982048988342285,
"rewards/format_reward_step": 0.96875,
"rewards/step_margin_reward": 0.24040833115577698,
"step": 114
},
{
"adv/mean_abs_final_conf": 0.5842900276184082,
"adv/mean_abs_reasoning": 0.4125009775161743,
"adv/mean_abs_step_conf": 0.6591264009475708,
"adv/ratio_final_to_reasoning": 1.4164573163841727,
"adv/ratio_step_to_reasoning": 1.597878397565073,
"adv/std_final_conf": 0.7837476134300232,
"adv/std_reasoning": 0.6817185282707214,
"adv/std_step_conf": 0.8758804202079773,
"calib/answer_extract_rate": 0.96484375,
"calib/avg_num_step_conf": 13.6640625,
"calib/ece": 0.3283157894736841,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.8623481781376519,
"calib/gap": 0.15629926296759833,
"calib/mean_conf": 0.867668016194332,
"calib/mu_c": 0.9271503267973855,
"calib/mu_w": 0.7708510638297872,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2882753036437246,
"calib/std_conf": 0.3242157949415586,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.33306884057971015,
"calib/step_q_c_n": 1932.0,
"calib/step_q_gap": 0.031195915930923468,
"calib/step_q_w": 0.3018729246487867,
"calib/step_q_w_n": 1566.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2148.0,
"completions/max_terminated_length": 2148.0,
"completions/mean_length": 722.2109375,
"completions/mean_terminated_length": 748.5263061523438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 300.0,
"epoch": 0.12266666666666666,
"grad_norm": 0.931440532207489,
"kl": 0.24176025390625,
"learning_rate": 2.388888888888889e-06,
"loss": -0.1057,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.020129762589931488,
"mask/share_reasoning": 0.7740639448165894,
"mask/share_step_conf": 0.17065003514289856,
"num_tokens": 35438130.0,
"reward": 0.564358115196228,
"reward_std": 0.24982187151908875,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6476209759712219,
"rewards/format_reward_step": 0.96484375,
"rewards/step_margin_reward": 0.16859526932239532,
"step": 115
},
{
"adv/mean_abs_final_conf": 0.629609227180481,
"adv/mean_abs_reasoning": 0.4123613238334656,
"adv/mean_abs_step_conf": 0.6170235276222229,
"adv/ratio_final_to_reasoning": 1.526838698953135,
"adv/ratio_step_to_reasoning": 1.4963176514376777,
"adv/std_final_conf": 0.8164438009262085,
"adv/std_reasoning": 0.6817293167114258,
"adv/std_step_conf": 0.8592019081115723,
"calib/answer_extract_rate": 0.953125,
"calib/avg_num_step_conf": 16.59765625,
"calib/ece": 0.2614959016393441,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.8032786885245902,
"calib/gap": 0.25743358047574905,
"calib/mean_conf": 0.8067418032786884,
"calib/mu_c": 0.8890361445783131,
"calib/mu_w": 0.6316025641025641,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19395491803278672,
"calib/std_conf": 0.37868427706350427,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3237091684434968,
"calib/step_q_c_n": 2345.0,
"calib/step_q_gap": 0.02345181550232034,
"calib/step_q_w": 0.30025735294117645,
"calib/step_q_w_n": 1904.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2442.0,
"completions/max_terminated_length": 2442.0,
"completions/mean_length": 832.04296875,
"completions/mean_terminated_length": 872.9630737304688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 205.0,
"epoch": 0.12373333333333333,
"grad_norm": 1.2763618230819702,
"kl": 0.215667724609375,
"learning_rate": 2.361111111111111e-06,
"loss": -0.0845,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.018999746069312096,
"mask/share_reasoning": 0.766923725605011,
"mask/share_step_conf": 0.16720154881477356,
"num_tokens": 35755653.0,
"reward": 0.6194130778312683,
"reward_std": 0.2369731217622757,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.7006518244743347,
"rewards/format_reward_step": 0.953125,
"rewards/step_margin_reward": 0.21786174178123474,
"step": 116
},
{
"adv/mean_abs_final_conf": 0.7082833051681519,
"adv/mean_abs_reasoning": 0.48807835578918457,
"adv/mean_abs_step_conf": 0.6436492204666138,
"adv/ratio_final_to_reasoning": 1.4511672086399183,
"adv/ratio_step_to_reasoning": 1.3187415767000839,
"adv/std_final_conf": 0.8633624911308289,
"adv/std_reasoning": 0.7394745349884033,
"adv/std_step_conf": 0.8554308414459229,
"calib/answer_extract_rate": 0.9296875,
"calib/avg_num_step_conf": 16.33203125,
"calib/ece": 0.3534663865546218,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.5714285714285714,
"calib/gap": 0.21689822755860483,
"calib/mean_conf": 0.6089285714285715,
"calib/mu_c": 0.705530303030303,
"calib/mu_w": 0.4886320754716981,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20388655462184874,
"calib/std_conf": 0.46709724837101574,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3297563805104408,
"calib/step_q_c_n": 1724.0,
"calib/step_q_gap": 0.05719838295244328,
"calib/step_q_w": 0.27255799755799753,
"calib/step_q_w_n": 2457.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 2486.0,
"completions/max_terminated_length": 2486.0,
"completions/mean_length": 753.875,
"completions/mean_terminated_length": 810.8908081054688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 321.0,
"epoch": 0.1248,
"grad_norm": 2.708895683288574,
"kl": 0.22900390625,
"learning_rate": 2.3333333333333336e-06,
"loss": -0.2058,
"mask/has_final_conf_rate": 0.9296875,
"mask/share_final_conf": 0.018687278032302856,
"mask/share_reasoning": 0.7465159296989441,
"mask/share_step_conf": 0.16448429226875305,
"num_tokens": 36055245.0,
"reward": 0.5642485618591309,
"reward_std": 0.2435990869998932,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.5940789580345154,
"rewards/format_reward_step": 0.9296875,
"rewards/step_margin_reward": 0.24535568058490753,
"step": 117
},
{
"adv/mean_abs_final_conf": 0.7272151708602905,
"adv/mean_abs_reasoning": 0.31207209825515747,
"adv/mean_abs_step_conf": 0.6416611075401306,
"adv/ratio_final_to_reasoning": 2.3302793646925215,
"adv/ratio_step_to_reasoning": 2.0561309746297582,
"adv/std_final_conf": 0.8775283694267273,
"adv/std_reasoning": 0.5962818264961243,
"adv/std_step_conf": 0.8595139384269714,
"calib/answer_extract_rate": 0.93359375,
"calib/avg_num_step_conf": 16.84375,
"calib/ece": 0.27910041841004174,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.5774058577405857,
"calib/gap": 0.2964238190286095,
"calib/mean_conf": 0.6315690376569039,
"calib/mu_c": 0.7208682634730539,
"calib/mu_w": 0.4244444444444444,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10596234309623422,
"calib/std_conf": 0.4492533862604019,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3522909090909091,
"calib/step_q_c_n": 2200.0,
"calib/step_q_gap": -0.018520643939393955,
"calib/step_q_w": 0.37081155303030305,
"calib/step_q_w_n": 2112.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2900.0,
"completions/max_terminated_length": 2900.0,
"completions/mean_length": 812.98046875,
"completions/mean_terminated_length": 867.17919921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 297.0,
"epoch": 0.12586666666666665,
"grad_norm": 4.6349310874938965,
"kl": 0.223388671875,
"learning_rate": 2.305555555555556e-06,
"loss": -0.197,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.01810162514448166,
"mask/share_reasoning": 0.7572929263114929,
"mask/share_step_conf": 0.16210542619228363,
"num_tokens": 36367376.0,
"reward": 0.5802870988845825,
"reward_std": 0.21968314051628113,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.6609405279159546,
"rewards/format_reward_step": 0.93359375,
"rewards/step_margin_reward": 0.18244624137878418,
"step": 118
},
{
"adv/mean_abs_final_conf": 0.6874723434448242,
"adv/mean_abs_reasoning": 0.44178149104118347,
"adv/mean_abs_step_conf": 0.6230811476707458,
"adv/ratio_final_to_reasoning": 1.5561365910205982,
"adv/ratio_step_to_reasoning": 1.4103830973141909,
"adv/std_final_conf": 0.8441257476806641,
"adv/std_reasoning": 0.7208822965621948,
"adv/std_step_conf": 0.8435025811195374,
"calib/answer_extract_rate": 0.90625,
"calib/avg_num_step_conf": 18.06640625,
"calib/ece": 0.32571120689655175,
"calib/final_conf_rate": 0.90625,
"calib/format_rate": 0.90625,
"calib/frac_conf_gt_0.9": 0.6681034482758621,
"calib/gap": 0.13268315018315013,
"calib/mean_conf": 0.7221767241379311,
"calib/mu_c": 0.7667857142857143,
"calib/mu_w": 0.6341025641025642,
"calib/nonempty_final_conf_rate": 0.90625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1920474137931035,
"calib/std_conf": 0.41018065697298056,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.32542656325739217,
"calib/step_q_c_n": 2063.0,
"calib/step_q_gap": -0.023979369607557033,
"calib/step_q_w": 0.3494059328649492,
"calib/step_q_w_n": 2562.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08984375,
"completions/max_length": 2907.0,
"completions/max_terminated_length": 2907.0,
"completions/mean_length": 787.16015625,
"completions/mean_terminated_length": 864.8626708984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 223.0,
"epoch": 0.12693333333333334,
"grad_norm": 1.767160177230835,
"kl": 0.221343994140625,
"learning_rate": 2.277777777777778e-06,
"loss": -0.2612,
"mask/has_final_conf_rate": 0.90625,
"mask/share_final_conf": 0.018292531371116638,
"mask/share_reasoning": 0.7322384715080261,
"mask/share_step_conf": 0.15962526202201843,
"num_tokens": 36673953.0,
"reward": 0.5670791864395142,
"reward_std": 0.2390528917312622,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6021067500114441,
"rewards/format_reward_step": 0.90625,
"rewards/step_margin_reward": 0.2304891049861908,
"step": 119
},
{
"adv/mean_abs_final_conf": 0.546036958694458,
"adv/mean_abs_reasoning": 0.3437703847885132,
"adv/mean_abs_step_conf": 0.704839289188385,
"adv/ratio_final_to_reasoning": 1.5883769599012398,
"adv/ratio_step_to_reasoning": 2.050319982106663,
"adv/std_final_conf": 0.753393292427063,
"adv/std_reasoning": 0.6403025984764099,
"adv/std_step_conf": 0.9208722710609436,
"calib/answer_extract_rate": 0.953125,
"calib/avg_num_step_conf": 13.94140625,
"calib/ece": 0.18858775510204093,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.7346938775510204,
"calib/gap": 0.4150723881776512,
"calib/mean_conf": 0.7817306122448979,
"calib/mu_c": 0.9070994152046782,
"calib/mu_w": 0.492027027027027,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1361795918367348,
"calib/std_conf": 0.38059433768675144,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.34108988764044945,
"calib/step_q_c_n": 1958.0,
"calib/step_q_gap": -0.037941769715230333,
"calib/step_q_w": 0.3790316573556798,
"calib/step_q_w_n": 1611.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2375.0,
"completions/max_terminated_length": 2375.0,
"completions/mean_length": 725.33203125,
"completions/mean_terminated_length": 757.89794921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 262.0,
"epoch": 0.128,
"grad_norm": 1.3622678518295288,
"kl": 0.246917724609375,
"learning_rate": 2.25e-06,
"loss": -0.0768,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.020279962569475174,
"mask/share_reasoning": 0.7692380547523499,
"mask/share_step_conf": 0.16751320660114288,
"num_tokens": 36966326.0,
"reward": 0.6724545955657959,
"reward_std": 0.179477721452713,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.7735120058059692,
"rewards/format_reward_step": 0.953125,
"rewards/step_margin_reward": 0.2471783459186554,
"step": 120
},
{
"adv/mean_abs_final_conf": 0.6599460244178772,
"adv/mean_abs_reasoning": 0.5737216472625732,
"adv/mean_abs_step_conf": 0.6109805107116699,
"adv/ratio_final_to_reasoning": 1.1502895656224768,
"adv/ratio_step_to_reasoning": 1.0649424047826532,
"adv/std_final_conf": 0.8613766431808472,
"adv/std_reasoning": 0.8099164962768555,
"adv/std_step_conf": 0.8437104821205139,
"calib/answer_extract_rate": 0.94140625,
"calib/avg_num_step_conf": 15.734375,
"calib/ece": 0.2038589211618257,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.7510373443983402,
"calib/gap": 0.32272148257725175,
"calib/mean_conf": 0.7997095435684648,
"calib/mu_c": 0.8961242603550296,
"calib/mu_w": 0.5734027777777778,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.15116182572614104,
"calib/std_conf": 0.3613657224261938,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.3475085638998683,
"calib/step_q_c_n": 2277.0,
"calib/step_q_gap": 0.0032252971951282072,
"calib/step_q_w": 0.3442832667047401,
"calib/step_q_w_n": 1751.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 2328.0,
"completions/max_terminated_length": 2328.0,
"completions/mean_length": 834.1171875,
"completions/mean_terminated_length": 886.0332641601562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 265.0,
"epoch": 0.12906666666666666,
"grad_norm": 1.4408384561538696,
"kl": 0.223052978515625,
"learning_rate": 2.222222222222222e-06,
"loss": -0.1841,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.017796337604522705,
"mask/share_reasoning": 0.7621406316757202,
"mask/share_step_conf": 0.16146929562091827,
"num_tokens": 37284916.0,
"reward": 0.6500805616378784,
"reward_std": 0.2570773661136627,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.7394177913665771,
"rewards/format_reward_step": 0.94140625,
"rewards/step_margin_reward": 0.2404308319091797,
"step": 121
},
{
"adv/mean_abs_final_conf": 0.4568904638290405,
"adv/mean_abs_reasoning": 0.31823331117630005,
"adv/mean_abs_step_conf": 0.6336137652397156,
"adv/ratio_final_to_reasoning": 1.4357091095844612,
"adv/ratio_step_to_reasoning": 1.9910353284439666,
"adv/std_final_conf": 0.7229413986206055,
"adv/std_reasoning": 0.5960581302642822,
"adv/std_step_conf": 0.875344455242157,
"calib/answer_extract_rate": 0.93359375,
"calib/avg_num_step_conf": 15.18359375,
"calib/ece": 0.18924686192468615,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.8451882845188284,
"calib/gap": 0.35369532926225444,
"calib/mean_conf": 0.8679916317991632,
"calib/mu_c": 0.9760240963855422,
"calib/mu_w": 0.6223287671232878,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18133891213389117,
"calib/std_conf": 0.3185097752305585,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.38515778582514226,
"calib/step_q_c_n": 1933.0,
"calib/step_q_gap": -0.04652082215848108,
"calib/step_q_w": 0.43167860798362334,
"calib/step_q_w_n": 1954.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2535.0,
"completions/max_terminated_length": 2535.0,
"completions/mean_length": 786.87109375,
"completions/mean_terminated_length": 839.3292236328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 353.0,
"epoch": 0.13013333333333332,
"grad_norm": 1.0047415494918823,
"kl": 0.219940185546875,
"learning_rate": 2.1944444444444445e-06,
"loss": -0.1253,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.01762072555720806,
"mask/share_reasoning": 0.7645582556724548,
"mask/share_step_conf": 0.15532098710536957,
"num_tokens": 37593699.0,
"reward": 0.6291664838790894,
"reward_std": 0.19675558805465698,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.75284743309021,
"rewards/format_reward_step": 0.93359375,
"rewards/step_margin_reward": 0.18907909095287323,
"step": 122
},
{
"adv/mean_abs_final_conf": 0.5483672618865967,
"adv/mean_abs_reasoning": 0.5403405427932739,
"adv/mean_abs_step_conf": 0.5864270925521851,
"adv/ratio_final_to_reasoning": 1.0148549265835742,
"adv/ratio_step_to_reasoning": 1.0852916746181365,
"adv/std_final_conf": 0.7761465311050415,
"adv/std_reasoning": 0.7929040193557739,
"adv/std_step_conf": 0.8109169602394104,
"calib/answer_extract_rate": 0.9453125,
"calib/avg_num_step_conf": 15.0859375,
"calib/ece": 0.2846694214876034,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.9049586776859504,
"calib/gap": 0.17493440667859284,
"calib/mean_conf": 0.9196280991735538,
"calib/mu_c": 0.9817948717948719,
"calib/mu_w": 0.8068604651162791,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.27983471074380173,
"calib/std_conf": 0.2568951335149414,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3788781163434903,
"calib/step_q_c_n": 2166.0,
"calib/step_q_gap": -0.016869525165943755,
"calib/step_q_w": 0.39574764150943403,
"calib/step_q_w_n": 1696.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2971.0,
"completions/max_terminated_length": 2971.0,
"completions/mean_length": 852.58984375,
"completions/mean_terminated_length": 898.2015991210938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 344.0,
"epoch": 0.1312,
"grad_norm": 1.0780717134475708,
"kl": 0.21270751953125,
"learning_rate": 2.166666666666667e-06,
"loss": -0.2034,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.017740830779075623,
"mask/share_reasoning": 0.7722517251968384,
"mask/share_step_conf": 0.1592261791229248,
"num_tokens": 37917250.0,
"reward": 0.5830578207969666,
"reward_std": 0.24974925816059113,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.6706482172012329,
"rewards/format_reward_step": 0.9453125,
"rewards/step_margin_reward": 0.1845298558473587,
"step": 123
},
{
"adv/mean_abs_final_conf": 0.4309426546096802,
"adv/mean_abs_reasoning": 0.44589105248451233,
"adv/mean_abs_step_conf": 0.6884101033210754,
"adv/ratio_final_to_reasoning": 0.9664752235068647,
"adv/ratio_step_to_reasoning": 1.5438975496037495,
"adv/std_final_conf": 0.703031063079834,
"adv/std_reasoning": 0.7206540703773499,
"adv/std_step_conf": 0.8913577198982239,
"calib/answer_extract_rate": 0.94140625,
"calib/avg_num_step_conf": 14.015625,
"calib/ece": 0.20929752066115706,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.8966942148760331,
"calib/gap": 0.15107758620689649,
"calib/mean_conf": 0.9200413223140496,
"calib/mu_c": 0.9562499999999999,
"calib/mu_w": 0.8051724137931034,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.184504132231405,
"calib/std_conf": 0.24728525692500625,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4224380762411347,
"calib/step_q_c_n": 2256.0,
"calib/step_q_gap": -0.04011072255766407,
"calib/step_q_w": 0.4625487987987988,
"calib/step_q_w_n": 1332.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 2059.0,
"completions/max_terminated_length": 2059.0,
"completions/mean_length": 774.3203125,
"completions/mean_terminated_length": 819.1156616210938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 276.0,
"epoch": 0.13226666666666667,
"grad_norm": 0.6981167793273926,
"kl": 0.224945068359375,
"learning_rate": 2.138888888888889e-06,
"loss": -0.1673,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.018614403903484344,
"mask/share_reasoning": 0.7690895199775696,
"mask/share_step_conf": 0.1576085388660431,
"num_tokens": 38222292.0,
"reward": 0.669231116771698,
"reward_std": 0.23245593905448914,
"rewards/accuracy_reward_step": 0.71875,
"rewards/final_brier_reward_step": 0.7431816458702087,
"rewards/format_reward_step": 0.94140625,
"rewards/step_margin_reward": 0.26324930787086487,
"step": 124
},
{
"adv/mean_abs_final_conf": 0.5283767580986023,
"adv/mean_abs_reasoning": 0.4329782724380493,
"adv/mean_abs_step_conf": 0.5721372365951538,
"adv/ratio_final_to_reasoning": 1.220330884326771,
"adv/ratio_step_to_reasoning": 1.3213994165885434,
"adv/std_final_conf": 0.776694655418396,
"adv/std_reasoning": 0.720791220664978,
"adv/std_step_conf": 0.8109546899795532,
"calib/answer_extract_rate": 0.95703125,
"calib/avg_num_step_conf": 13.53125,
"calib/ece": 0.28979591836734697,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.8367346938775511,
"calib/gap": 0.18335968081903042,
"calib/mean_conf": 0.8522448979591837,
"calib/mu_c": 0.9128658536585367,
"calib/mu_w": 0.7295061728395063,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.23632653061224493,
"calib/std_conf": 0.3435865176735409,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.41351314804917155,
"calib/step_q_c_n": 1871.0,
"calib/step_q_gap": -0.040548371097093305,
"calib/step_q_w": 0.45406151914626486,
"calib/step_q_w_n": 1593.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2764.0,
"completions/max_terminated_length": 2764.0,
"completions/mean_length": 767.30859375,
"completions/mean_terminated_length": 798.5,
"completions/min_length": 0.0,
"completions/min_terminated_length": 247.0,
"epoch": 0.13333333333333333,
"grad_norm": 0.6414250731468201,
"kl": 0.235382080078125,
"learning_rate": 2.1111111111111114e-06,
"loss": -0.1151,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.020531386137008667,
"mask/share_reasoning": 0.7824704647064209,
"mask/share_step_conf": 0.15793566405773163,
"num_tokens": 38523531.0,
"reward": 0.6279151439666748,
"reward_std": 0.24993808567523956,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.6779242157936096,
"rewards/format_reward_step": 0.95703125,
"rewards/step_margin_reward": 0.2583746910095215,
"step": 125
},
{
"adv/mean_abs_final_conf": 0.5080631971359253,
"adv/mean_abs_reasoning": 0.4330710768699646,
"adv/mean_abs_step_conf": 0.5981600880622864,
"adv/ratio_final_to_reasoning": 1.1731635388998238,
"adv/ratio_step_to_reasoning": 1.3812053494440404,
"adv/std_final_conf": 0.7579245567321777,
"adv/std_reasoning": 0.7016152739524841,
"adv/std_step_conf": 0.8276919722557068,
"calib/answer_extract_rate": 0.9296875,
"calib/avg_num_step_conf": 14.2109375,
"calib/ece": 0.292983193277311,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.865546218487395,
"calib/gap": 0.1315063291139239,
"calib/mean_conf": 0.9013025210084034,
"calib/mu_c": 0.945506329113924,
"calib/mu_w": 0.8140000000000001,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2652100840336135,
"calib/std_conf": 0.26951946203161764,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.40487768969422416,
"calib/step_q_c_n": 1766.0,
"calib/step_q_gap": 0.024452476019010516,
"calib/step_q_w": 0.38042521367521365,
"calib/step_q_w_n": 1872.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 2527.0,
"completions/max_terminated_length": 2527.0,
"completions/mean_length": 739.34375,
"completions/mean_terminated_length": 795.2605590820312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 266.0,
"epoch": 0.1344,
"grad_norm": 0.9234392642974854,
"kl": 0.234893798828125,
"learning_rate": 2.0833333333333334e-06,
"loss": -0.2053,
"mask/has_final_conf_rate": 0.9296875,
"mask/share_final_conf": 0.020162848755717278,
"mask/share_reasoning": 0.7513543367385864,
"mask/share_step_conf": 0.15817034244537354,
"num_tokens": 38818267.0,
"reward": 0.5585942268371582,
"reward_std": 0.23993246257305145,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.6568480730056763,
"rewards/format_reward_step": 0.9296875,
"rewards/step_margin_reward": 0.1509653478860855,
"step": 126
},
{
"adv/mean_abs_final_conf": 0.45118528604507446,
"adv/mean_abs_reasoning": 0.4313002824783325,
"adv/mean_abs_step_conf": 0.6212649345397949,
"adv/ratio_final_to_reasoning": 1.0461047775171373,
"adv/ratio_step_to_reasoning": 1.440446389160447,
"adv/std_final_conf": 0.7026150822639465,
"adv/std_reasoning": 0.7016158699989319,
"adv/std_step_conf": 0.8277674317359924,
"calib/answer_extract_rate": 0.92578125,
"calib/avg_num_step_conf": 13.5390625,
"calib/ece": 0.31177215189873425,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.8860759493670886,
"calib/gap": 0.12139142407553105,
"calib/mean_conf": 0.8983544303797468,
"calib/mu_c": 0.9403548387096774,
"calib/mu_w": 0.8189634146341463,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.27805907172995786,
"calib/std_conf": 0.290054616040288,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.446405503057254,
"calib/step_q_c_n": 1799.0,
"calib/step_q_gap": 0.031089966164632554,
"calib/step_q_w": 0.41531553689262146,
"calib/step_q_w_n": 1667.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 3035.0,
"completions/max_terminated_length": 3035.0,
"completions/mean_length": 690.59765625,
"completions/mean_terminated_length": 742.8277587890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 238.0,
"epoch": 0.13546666666666668,
"grad_norm": 0.7550332546234131,
"kl": 0.249603271484375,
"learning_rate": 2.0555555555555555e-06,
"loss": -0.1629,
"mask/has_final_conf_rate": 0.92578125,
"mask/share_final_conf": 0.02026120387017727,
"mask/share_reasoning": 0.7483981847763062,
"mask/share_step_conf": 0.16102807223796844,
"num_tokens": 39098732.0,
"reward": 0.5520787239074707,
"reward_std": 0.24869316816329956,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.6339927911758423,
"rewards/format_reward_step": 0.92578125,
"rewards/step_margin_reward": 0.16391471028327942,
"step": 127
},
{
"adv/mean_abs_final_conf": 0.5826007127761841,
"adv/mean_abs_reasoning": 0.4872943162918091,
"adv/mean_abs_step_conf": 0.5902491807937622,
"adv/ratio_final_to_reasoning": 1.195582819864663,
"adv/ratio_step_to_reasoning": 1.2112786073217816,
"adv/std_final_conf": 0.8118354678153992,
"adv/std_reasoning": 0.7394742369651794,
"adv/std_step_conf": 0.827497124671936,
"calib/answer_extract_rate": 0.93359375,
"calib/avg_num_step_conf": 14.203125,
"calib/ece": 0.29087866108786625,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.8702928870292888,
"calib/gap": 0.2160395010395012,
"calib/mean_conf": 0.9030125523012552,
"calib/mu_c": 0.9852702702702703,
"calib/mu_w": 0.769230769230769,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.28732217573221774,
"calib/std_conf": 0.2635021691521325,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42852554961378486,
"calib/step_q_c_n": 1683.0,
"calib/step_q_gap": 0.05191356804696462,
"calib/step_q_w": 0.37661198156682024,
"calib/step_q_w_n": 1953.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 2669.0,
"completions/max_terminated_length": 2669.0,
"completions/mean_length": 730.25390625,
"completions/mean_terminated_length": 782.1966552734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 237.0,
"epoch": 0.13653333333333334,
"grad_norm": 0.9753623604774475,
"kl": 0.23822021484375,
"learning_rate": 2.027777777777778e-06,
"loss": -0.2389,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.020317887887358665,
"mask/share_reasoning": 0.7514255046844482,
"mask/share_step_conf": 0.16185034811496735,
"num_tokens": 39392341.0,
"reward": 0.5939993262290955,
"reward_std": 0.26951467990875244,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.6685829758644104,
"rewards/format_reward_step": 0.93359375,
"rewards/step_margin_reward": 0.2170717716217041,
"step": 128
},
{
"adv/mean_abs_final_conf": 0.4442763328552246,
"adv/mean_abs_reasoning": 0.45170193910598755,
"adv/mean_abs_step_conf": 0.6424069404602051,
"adv/ratio_final_to_reasoning": 0.9835608271563772,
"adv/ratio_step_to_reasoning": 1.422192124593626,
"adv/std_final_conf": 0.7024354934692383,
"adv/std_reasoning": 0.7207036018371582,
"adv/std_step_conf": 0.8758267164230347,
"calib/answer_extract_rate": 0.95703125,
"calib/avg_num_step_conf": 13.2734375,
"calib/ece": 0.24002857142857154,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.7918367346938775,
"calib/gap": 0.2521655696943106,
"calib/mean_conf": 0.8419061224489796,
"calib/mu_c": 0.9304213836477988,
"calib/mu_w": 0.6782558139534882,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21647755102040828,
"calib/std_conf": 0.32542536584910803,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4621792763157895,
"calib/step_q_c_n": 1824.0,
"calib/step_q_gap": 0.05840926360930926,
"calib/step_q_w": 0.4037700127064802,
"calib/step_q_w_n": 1574.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2338.0,
"completions/max_terminated_length": 2338.0,
"completions/mean_length": 692.5078125,
"completions/mean_terminated_length": 723.5999755859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 283.0,
"epoch": 0.1376,
"grad_norm": 0.7908511757850647,
"kl": 0.26397705078125,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.1356,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.022384271025657654,
"mask/share_reasoning": 0.7689116597175598,
"mask/share_step_conf": 0.16573531925678253,
"num_tokens": 39672007.0,
"reward": 0.6568441390991211,
"reward_std": 0.1993679404258728,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.7119947075843811,
"rewards/format_reward_step": 0.95703125,
"rewards/step_margin_reward": 0.2860685884952545,
"step": 129
},
{
"adv/mean_abs_final_conf": 0.3676624596118927,
"adv/mean_abs_reasoning": 0.19307512044906616,
"adv/mean_abs_step_conf": 0.5610698461532593,
"adv/ratio_final_to_reasoning": 1.9042456571139794,
"adv/ratio_step_to_reasoning": 2.905966573260646,
"adv/std_final_conf": 0.6428574323654175,
"adv/std_reasoning": 0.495947003364563,
"adv/std_step_conf": 0.8103545308113098,
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 12.69921875,
"calib/ece": 0.14574596774193543,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.8064516129032258,
"calib/gap": 0.38447490347490376,
"calib/mean_conf": 0.8525201612903226,
"calib/mu_c": 0.9501891891891894,
"calib/mu_w": 0.5657142857142856,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.12614919354838705,
"calib/std_conf": 0.31570038559445907,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4415043301069791,
"calib/step_q_c_n": 1963.0,
"calib/step_q_gap": 0.04577063445480517,
"calib/step_q_w": 0.3957336956521739,
"calib/step_q_w_n": 1288.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2341.0,
"completions/max_terminated_length": 2341.0,
"completions/mean_length": 663.4609375,
"completions/mean_terminated_length": 684.8628540039062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 316.0,
"epoch": 0.13866666666666666,
"grad_norm": 1.1481865644454956,
"kl": 0.264801025390625,
"learning_rate": 1.9722222222222224e-06,
"loss": -0.1471,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.02265078015625477,
"mask/share_reasoning": 0.7670093774795532,
"mask/share_step_conf": 0.17908982932567596,
"num_tokens": 39947141.0,
"reward": 0.7057443857192993,
"reward_std": 0.14136728644371033,
"rewards/accuracy_reward_step": 0.72265625,
"rewards/final_brier_reward_step": 0.8187835216522217,
"rewards/format_reward_step": 0.96875,
"rewards/step_margin_reward": 0.25442397594451904,
"step": 130
},
{
"adv/mean_abs_final_conf": 0.6045816540718079,
"adv/mean_abs_reasoning": 0.42527127265930176,
"adv/mean_abs_step_conf": 0.5815385580062866,
"adv/ratio_final_to_reasoning": 1.4216376532824433,
"adv/ratio_step_to_reasoning": 1.3674531890428807,
"adv/std_final_conf": 0.7785736918449402,
"adv/std_reasoning": 0.7207863926887512,
"adv/std_step_conf": 0.8273931741714478,
"calib/answer_extract_rate": 0.90625,
"calib/avg_num_step_conf": 15.6015625,
"calib/ece": 0.15808189655172422,
"calib/final_conf_rate": 0.90625,
"calib/format_rate": 0.90625,
"calib/frac_conf_gt_0.9": 0.4827586206896552,
"calib/gap": 0.5209051724137934,
"calib/mean_conf": 0.5738577586206897,
"calib/mu_c": 0.8343103448275864,
"calib/mu_w": 0.313405172413793,
"calib/nonempty_final_conf_rate": 0.90625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11596982758620697,
"calib/std_conf": 0.4387053460233405,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.38929024583663757,
"calib/step_q_c_n": 1261.0,
"calib/step_q_gap": 0.10923810533169798,
"calib/step_q_w": 0.2800521405049396,
"calib/step_q_w_n": 2733.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0859375,
"completions/max_length": 3058.0,
"completions/max_terminated_length": 3058.0,
"completions/mean_length": 698.453125,
"completions/mean_terminated_length": 764.1196899414062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 285.0,
"epoch": 0.13973333333333332,
"grad_norm": 1.5745195150375366,
"kl": 0.25299072265625,
"learning_rate": 1.944444444444445e-06,
"loss": -0.2455,
"mask/has_final_conf_rate": 0.90625,
"mask/share_final_conf": 0.0197146013379097,
"mask/share_reasoning": 0.7379894256591797,
"mask/share_step_conf": 0.15635845065116882,
"num_tokens": 40232153.0,
"reward": 0.6144878268241882,
"reward_std": 0.19122637808322906,
"rewards/accuracy_reward_step": 0.453125,
"rewards/final_brier_reward_step": 0.7363600730895996,
"rewards/format_reward_step": 0.90625,
"rewards/step_margin_reward": 0.22074052691459656,
"step": 131
},
{
"adv/mean_abs_final_conf": 0.580619215965271,
"adv/mean_abs_reasoning": 0.43576955795288086,
"adv/mean_abs_step_conf": 0.606571614742279,
"adv/ratio_final_to_reasoning": 1.332399671727534,
"adv/ratio_step_to_reasoning": 1.3919549993160991,
"adv/std_final_conf": 0.8146044015884399,
"adv/std_reasoning": 0.6819318532943726,
"adv/std_step_conf": 0.8462624549865723,
"calib/answer_extract_rate": 0.8984375,
"calib/avg_num_step_conf": 15.38671875,
"calib/ece": 0.2710217391304348,
"calib/final_conf_rate": 0.8984375,
"calib/format_rate": 0.8984375,
"calib/frac_conf_gt_0.9": 0.44782608695652176,
"calib/gap": 0.573583509513742,
"calib/mean_conf": 0.5486739130434782,
"calib/mu_c": 0.6559090909090909,
"calib/mu_w": 0.08232558139534883,
"calib/nonempty_final_conf_rate": 0.8984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.003326086956521741,
"calib/std_conf": 0.4409134320213686,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.35733181818181814,
"calib/step_q_c_n": 2200.0,
"calib/step_q_gap": 0.10112014480631498,
"calib/step_q_w": 0.25621167337550316,
"calib/step_q_w_n": 1739.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1015625,
"completions/max_length": 1797.0,
"completions/max_terminated_length": 1797.0,
"completions/mean_length": 664.88671875,
"completions/mean_terminated_length": 740.0477905273438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 260.0,
"epoch": 0.1408,
"grad_norm": 2.7517290115356445,
"kl": 0.253387451171875,
"learning_rate": 1.916666666666667e-06,
"loss": -0.2744,
"mask/has_final_conf_rate": 0.8984375,
"mask/share_final_conf": 0.020201554521918297,
"mask/share_reasoning": 0.7178224325180054,
"mask/share_step_conf": 0.16041353344917297,
"num_tokens": 40507956.0,
"reward": 0.6151009798049927,
"reward_std": 0.19379822909832,
"rewards/accuracy_reward_step": 0.73046875,
"rewards/final_brier_reward_step": 0.6810821294784546,
"rewards/format_reward_step": 0.8984375,
"rewards/step_margin_reward": 0.22333842515945435,
"step": 132
},
{
"adv/mean_abs_final_conf": 0.8137380480766296,
"adv/mean_abs_reasoning": 0.7138146162033081,
"adv/mean_abs_step_conf": 0.6698095202445984,
"adv/ratio_final_to_reasoning": 1.1399851300395079,
"adv/ratio_step_to_reasoning": 0.9383522066376738,
"adv/std_final_conf": 0.9363965392112732,
"adv/std_reasoning": 0.8907052278518677,
"adv/std_step_conf": 0.8721390962600708,
"calib/answer_extract_rate": 0.8125,
"calib/avg_num_step_conf": 20.48046875,
"calib/ece": 0.33415865384615384,
"calib/final_conf_rate": 0.8125,
"calib/format_rate": 0.8125,
"calib/frac_conf_gt_0.9": 0.17307692307692307,
"calib/gap": 0.26145075757575753,
"calib/mean_conf": 0.2786778846153846,
"calib/mu_c": 0.38929166666666665,
"calib/mu_w": 0.1278409090909091,
"calib/nonempty_final_conf_rate": 0.8125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.01795673076923077,
"calib/std_conf": 0.3670671274431576,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3245042345276873,
"calib/step_q_c_n": 1535.0,
"calib/step_q_gap": 0.13698603064419215,
"calib/step_q_w": 0.18751820388349513,
"calib/step_q_w_n": 3708.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 1972.0,
"completions/max_terminated_length": 1972.0,
"completions/mean_length": 703.8203125,
"completions/mean_terminated_length": 866.2404174804688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.14186666666666667,
"grad_norm": 7.1175689697265625,
"kl": 0.2830810546875,
"learning_rate": 1.888888888888889e-06,
"loss": -0.5345,
"mask/has_final_conf_rate": 0.8125,
"mask/share_final_conf": 0.015356351621448994,
"mask/share_reasoning": 0.6622164249420166,
"mask/share_step_conf": 0.13492724299430847,
"num_tokens": 40794478.0,
"reward": 0.516724705696106,
"reward_std": 0.2438364326953888,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.536136269569397,
"rewards/format_reward_step": 0.8125,
"rewards/step_margin_reward": 0.2410632073879242,
"step": 133
},
{
"adv/mean_abs_final_conf": 0.7017267942428589,
"adv/mean_abs_reasoning": 0.5476149320602417,
"adv/mean_abs_step_conf": 0.6388225555419922,
"adv/ratio_final_to_reasoning": 1.281423776380268,
"adv/ratio_step_to_reasoning": 1.166554303292294,
"adv/std_final_conf": 0.8980323672294617,
"adv/std_reasoning": 0.8100696802139282,
"adv/std_step_conf": 0.8600804209709167,
"calib/answer_extract_rate": 0.88671875,
"calib/avg_num_step_conf": 17.6953125,
"calib/ece": 0.36605726872246686,
"calib/final_conf_rate": 0.88671875,
"calib/format_rate": 0.88671875,
"calib/frac_conf_gt_0.9": 0.2026431718061674,
"calib/gap": 0.3028257463301996,
"calib/mean_conf": 0.3038546255506608,
"calib/mu_c": 0.418581560283688,
"calib/mu_w": 0.11575581395348837,
"calib/nonempty_final_conf_rate": 0.88671875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.024383259911894245,
"calib/std_conf": 0.3899412708669093,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.2956536694677871,
"calib/step_q_c_n": 1785.0,
"calib/step_q_gap": 0.08443665671733172,
"calib/step_q_w": 0.21121701275045537,
"calib/step_q_w_n": 2745.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10546875,
"completions/max_length": 3070.0,
"completions/max_terminated_length": 3070.0,
"completions/mean_length": 770.0234375,
"completions/mean_terminated_length": 860.812255859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 270.0,
"epoch": 0.14293333333333333,
"grad_norm": 3.8303635120391846,
"kl": 0.224609375,
"learning_rate": 1.8611111111111113e-06,
"loss": -0.2861,
"mask/has_final_conf_rate": 0.88671875,
"mask/share_final_conf": 0.017552226781845093,
"mask/share_reasoning": 0.7260377407073975,
"mask/share_step_conf": 0.15094125270843506,
"num_tokens": 41100556.0,
"reward": 0.51554274559021,
"reward_std": 0.19827260076999664,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.5803333520889282,
"rewards/format_reward_step": 0.88671875,
"rewards/step_margin_reward": 0.16325227916240692,
"step": 134
},
{
"adv/mean_abs_final_conf": 0.7464656829833984,
"adv/mean_abs_reasoning": 0.5359787940979004,
"adv/mean_abs_step_conf": 0.6636673808097839,
"adv/ratio_final_to_reasoning": 1.3927149566425778,
"adv/ratio_step_to_reasoning": 1.2382344005359291,
"adv/std_final_conf": 0.9189602136611938,
"adv/std_reasoning": 0.7931401133537292,
"adv/std_step_conf": 0.8599939346313477,
"calib/answer_extract_rate": 0.9140625,
"calib/avg_num_step_conf": 16.0703125,
"calib/ece": 0.36658119658119653,
"calib/final_conf_rate": 0.9140625,
"calib/format_rate": 0.9140625,
"calib/frac_conf_gt_0.9": 0.2264957264957265,
"calib/gap": 0.21275192554557118,
"calib/mean_conf": 0.38722222222222225,
"calib/mu_c": 0.46177631578947365,
"calib/mu_w": 0.24902439024390247,
"calib/nonempty_final_conf_rate": 0.9140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0521153846153846,
"calib/std_conf": 0.38707507412703074,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.2987839958158996,
"calib/step_q_c_n": 1912.0,
"calib/step_q_gap": 0.08667818291853355,
"calib/step_q_w": 0.21210581289736605,
"calib/step_q_w_n": 2202.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0859375,
"completions/max_length": 2091.0,
"completions/max_terminated_length": 2091.0,
"completions/mean_length": 716.93359375,
"completions/mean_terminated_length": 784.337646484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 352.0,
"epoch": 0.144,
"grad_norm": 4.714707851409912,
"kl": 0.2374267578125,
"learning_rate": 1.8333333333333333e-06,
"loss": -0.3072,
"mask/has_final_conf_rate": 0.9140625,
"mask/share_final_conf": 0.01891816221177578,
"mask/share_reasoning": 0.7368408441543579,
"mask/share_step_conf": 0.15830349922180176,
"num_tokens": 41389971.0,
"reward": 0.6003228425979614,
"reward_std": 0.2147035449743271,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.5946650505065918,
"rewards/format_reward_step": 0.9140625,
"rewards/step_margin_reward": 0.3044181168079376,
"step": 135
},
{
"adv/mean_abs_final_conf": 0.6705352067947388,
"adv/mean_abs_reasoning": 0.5252009034156799,
"adv/mean_abs_step_conf": 0.5619533658027649,
"adv/ratio_final_to_reasoning": 1.27672135069431,
"adv/ratio_step_to_reasoning": 1.0699779115916648,
"adv/std_final_conf": 0.8491891026496887,
"adv/std_reasoning": 0.7930968999862671,
"adv/std_step_conf": 0.8269990086555481,
"calib/answer_extract_rate": 0.8515625,
"calib/avg_num_step_conf": 17.96484375,
"calib/ece": 0.14999082568807334,
"calib/final_conf_rate": 0.8515625,
"calib/format_rate": 0.8515625,
"calib/frac_conf_gt_0.9": 0.42201834862385323,
"calib/gap": 0.5577160548429898,
"calib/mean_conf": 0.5046697247706422,
"calib/mu_c": 0.7221278195488722,
"calib/mu_w": 0.1644117647058824,
"calib/nonempty_final_conf_rate": 0.8515625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.022284403669724735,
"calib/std_conf": 0.4448066800997846,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.33198000000000005,
"calib/step_q_c_n": 1485.0,
"calib/step_q_gap": 0.1005222286448299,
"calib/step_q_w": 0.23145777135517015,
"calib/step_q_w_n": 3114.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1484375,
"completions/max_length": 2616.0,
"completions/max_terminated_length": 2616.0,
"completions/mean_length": 652.3203125,
"completions/mean_terminated_length": 766.0274658203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 228.0,
"epoch": 0.14506666666666668,
"grad_norm": 1.8986977338790894,
"kl": 0.254119873046875,
"learning_rate": 1.8055555555555557e-06,
"loss": -0.4585,
"mask/has_final_conf_rate": 0.8515625,
"mask/share_final_conf": 0.018733292818069458,
"mask/share_reasoning": 0.6779872179031372,
"mask/share_step_conf": 0.15484192967414856,
"num_tokens": 41665453.0,
"reward": 0.583008885383606,
"reward_std": 0.22099415957927704,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.6969972848892212,
"rewards/format_reward_step": 0.8515625,
"rewards/step_margin_reward": 0.19480186700820923,
"step": 136
},
{
"adv/mean_abs_final_conf": 0.6077395677566528,
"adv/mean_abs_reasoning": 0.3589785397052765,
"adv/mean_abs_step_conf": 0.52858567237854,
"adv/ratio_final_to_reasoning": 1.6929690790307705,
"adv/ratio_step_to_reasoning": 1.4724715098916832,
"adv/std_final_conf": 0.8171355724334717,
"adv/std_reasoning": 0.6407114863395691,
"adv/std_step_conf": 0.776058554649353,
"calib/answer_extract_rate": 0.87890625,
"calib/avg_num_step_conf": 17.68359375,
"calib/ece": 0.20969333333333343,
"calib/final_conf_rate": 0.87890625,
"calib/format_rate": 0.87890625,
"calib/frac_conf_gt_0.9": 0.6666666666666666,
"calib/gap": 0.3545913728297836,
"calib/mean_conf": 0.7405377777777777,
"calib/mu_c": 0.8571589403973511,
"calib/mu_w": 0.5025675675675675,
"calib/nonempty_final_conf_rate": 0.87890625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1395600000000001,
"calib/std_conf": 0.384513475729114,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3302354227405248,
"calib/step_q_c_n": 2058.0,
"calib/step_q_gap": 0.061405127074263155,
"calib/step_q_w": 0.26883029566626165,
"calib/step_q_w_n": 2469.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.12109375,
"completions/max_length": 2928.0,
"completions/max_terminated_length": 2928.0,
"completions/mean_length": 720.8125,
"completions/mean_terminated_length": 820.1244506835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 313.0,
"epoch": 0.14613333333333334,
"grad_norm": 2.3515427112579346,
"kl": 0.222381591796875,
"learning_rate": 1.777777777777778e-06,
"loss": -0.3275,
"mask/has_final_conf_rate": 0.87890625,
"mask/share_final_conf": 0.01775544509291649,
"mask/share_reasoning": 0.7089648246765137,
"mask/share_step_conf": 0.15218594670295715,
"num_tokens": 41956965.0,
"reward": 0.6170933246612549,
"reward_std": 0.19505244493484497,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6883065700531006,
"rewards/format_reward_step": 0.87890625,
"rewards/step_margin_reward": 0.25213009119033813,
"step": 137
},
{
"adv/mean_abs_final_conf": 0.5493025779724121,
"adv/mean_abs_reasoning": 0.35511231422424316,
"adv/mean_abs_step_conf": 0.632819652557373,
"adv/ratio_final_to_reasoning": 1.5468418186859705,
"adv/ratio_step_to_reasoning": 1.7820267763448094,
"adv/std_final_conf": 0.7381003499031067,
"adv/std_reasoning": 0.6405137181282043,
"adv/std_step_conf": 0.843903660774231,
"calib/answer_extract_rate": 0.94140625,
"calib/avg_num_step_conf": 14.47265625,
"calib/ece": 0.2170791666666667,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.8041666666666667,
"calib/gap": 0.2905591830038552,
"calib/mean_conf": 0.8499958333333333,
"calib/mu_c": 0.9383742514970059,
"calib/mu_w": 0.6478150684931507,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1856208333333334,
"calib/std_conf": 0.3142465459624108,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.31700620404411767,
"calib/step_q_c_n": 2176.0,
"calib/step_q_gap": -0.056702102038289126,
"calib/step_q_w": 0.3737083060824068,
"calib/step_q_w_n": 1529.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 2955.0,
"completions/max_terminated_length": 2955.0,
"completions/mean_length": 724.83984375,
"completions/mean_terminated_length": 769.9544067382812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.1472,
"grad_norm": 1.2674281597137451,
"kl": 0.250762939453125,
"learning_rate": 1.75e-06,
"loss": -0.1559,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.021356452256441116,
"mask/share_reasoning": 0.7491965889930725,
"mask/share_step_conf": 0.17085321247577667,
"num_tokens": 42246860.0,
"reward": 0.6420503854751587,
"reward_std": 0.2165650725364685,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.739525318145752,
"rewards/format_reward_step": 0.9375,
"rewards/step_margin_reward": 0.22582541406154633,
"step": 138
},
{
"adv/mean_abs_final_conf": 0.5015965104103088,
"adv/mean_abs_reasoning": 0.2608618140220642,
"adv/mean_abs_step_conf": 0.5609601140022278,
"adv/ratio_final_to_reasoning": 1.922843756533422,
"adv/ratio_step_to_reasoning": 2.150410998655329,
"adv/std_final_conf": 0.7413507103919983,
"adv/std_reasoning": 0.5483863949775696,
"adv/std_step_conf": 0.8105617165565491,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 12.140625,
"calib/ece": 0.30063399999999985,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.916,
"calib/gap": 0.020221792658208892,
"calib/mean_conf": 0.9404739999999999,
"calib/mu_c": 0.9467023121387284,
"calib/mu_w": 0.9264805194805195,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.27455399999999985,
"calib/std_conf": 0.1952396484426255,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.31638177863702277,
"calib/step_q_c_n": 2069.0,
"calib/step_q_gap": -0.008583572662303529,
"calib/step_q_w": 0.3249653512993263,
"calib/step_q_w_n": 1039.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2209.0,
"completions/max_terminated_length": 2209.0,
"completions/mean_length": 686.16796875,
"completions/mean_terminated_length": 702.6360473632812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 268.0,
"epoch": 0.14826666666666666,
"grad_norm": 1.2241623401641846,
"kl": 0.256683349609375,
"learning_rate": 1.7222222222222224e-06,
"loss": -0.0701,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.02256494015455246,
"mask/share_reasoning": 0.7774582505226135,
"mask/share_step_conf": 0.1765393167734146,
"num_tokens": 42525615.0,
"reward": 0.6329671144485474,
"reward_std": 0.17554357647895813,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.679322361946106,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.2561429738998413,
"step": 139
},
{
"adv/mean_abs_final_conf": 0.46179768443107605,
"adv/mean_abs_reasoning": 0.32739922404289246,
"adv/mean_abs_step_conf": 0.6492865681648254,
"adv/ratio_final_to_reasoning": 1.41050329542191,
"adv/ratio_step_to_reasoning": 1.983164651849519,
"adv/std_final_conf": 0.7165549397468567,
"adv/std_reasoning": 0.6611654162406921,
"adv/std_step_conf": 0.8758783936500549,
"calib/answer_extract_rate": 0.97265625,
"calib/avg_num_step_conf": 14.390625,
"calib/ece": 0.25146485943775093,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9156626506024096,
"calib/gap": 0.13560537931034478,
"calib/mean_conf": 0.9435130522088354,
"calib/mu_c": 0.9843580459770115,
"calib/mu_w": 0.8487526666666667,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.24809136546184732,
"calib/std_conf": 0.18642138216210663,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.335719409465914,
"calib/step_q_c_n": 2303.0,
"calib/step_q_gap": 0.02131028564259757,
"calib/step_q_w": 0.3144091238233164,
"calib/step_q_w_n": 1381.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2577.0,
"completions/max_terminated_length": 2577.0,
"completions/mean_length": 778.75,
"completions/mean_terminated_length": 800.6425170898438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 273.0,
"epoch": 0.14933333333333335,
"grad_norm": 1.2806999683380127,
"kl": 0.2357177734375,
"learning_rate": 1.6944444444444446e-06,
"loss": -0.0411,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.020671725273132324,
"mask/share_reasoning": 0.7754533290863037,
"mask/share_step_conf": 0.17653116583824158,
"num_tokens": 42829991.0,
"reward": 0.6468392610549927,
"reward_std": 0.19622653722763062,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/final_brier_reward_step": 0.7314028143882751,
"rewards/format_reward_step": 0.97265625,
"rewards/step_margin_reward": 0.2318069040775299,
"step": 140
},
{
"adv/mean_abs_final_conf": 0.525245189666748,
"adv/mean_abs_reasoning": 0.33174651861190796,
"adv/mean_abs_step_conf": 0.5822703838348389,
"adv/ratio_final_to_reasoning": 1.5832726500476213,
"adv/ratio_step_to_reasoning": 1.7551665237397864,
"adv/std_final_conf": 0.7608556151390076,
"adv/std_reasoning": 0.6402722597122192,
"adv/std_step_conf": 0.8274043798446655,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 14.44921875,
"calib/ece": 0.23398999999999998,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.896,
"calib/gap": 0.1750773809523808,
"calib/mean_conf": 0.91907,
"calib/mu_c": 0.9680916666666666,
"calib/mu_w": 0.7930142857142858,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21652999999999997,
"calib/std_conf": 0.2410115476486552,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.32720863007959783,
"calib/step_q_c_n": 2387.0,
"calib/step_q_gap": -0.04450909095698752,
"calib/step_q_w": 0.37171772103658535,
"calib/step_q_w_n": 1312.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2345.0,
"completions/max_terminated_length": 2345.0,
"completions/mean_length": 814.62109375,
"completions/mean_terminated_length": 834.1720581054688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 353.0,
"epoch": 0.1504,
"grad_norm": 0.9242333173751831,
"kl": 0.212890625,
"learning_rate": 1.6666666666666667e-06,
"loss": -0.0614,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.01946493610739708,
"mask/share_reasoning": 0.7856360077857971,
"mask/share_step_conf": 0.17146152257919312,
"num_tokens": 43145630.0,
"reward": 0.6645768880844116,
"reward_std": 0.20759084820747375,
"rewards/accuracy_reward_step": 0.703125,
"rewards/final_brier_reward_step": 0.7531989812850952,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.24001729488372803,
"step": 141
},
{
"adv/mean_abs_final_conf": 0.5690226554870605,
"adv/mean_abs_reasoning": 0.42950016260147095,
"adv/mean_abs_step_conf": 0.6370354294776917,
"adv/ratio_final_to_reasoning": 1.3248485216874089,
"adv/ratio_step_to_reasoning": 1.4832018354060337,
"adv/std_final_conf": 0.7847913503646851,
"adv/std_reasoning": 0.7015143036842346,
"adv/std_step_conf": 0.8598779439926147,
"calib/answer_extract_rate": 0.9375,
"calib/avg_num_step_conf": 16.06640625,
"calib/ece": 0.2856208333333334,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.8708333333333333,
"calib/gap": 0.18633911483253596,
"calib/mean_conf": 0.9042875,
"calib/mu_c": 0.9726118421052633,
"calib/mu_w": 0.7862727272727273,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2782875000000001,
"calib/std_conf": 0.25235399378072726,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.36300235109717865,
"calib/step_q_c_n": 1914.0,
"calib/step_q_gap": -0.036242441990588514,
"calib/step_q_w": 0.39924479308776717,
"calib/step_q_w_n": 2199.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 2697.0,
"completions/max_terminated_length": 2697.0,
"completions/mean_length": 833.3125,
"completions/mean_terminated_length": 881.5206298828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 258.0,
"epoch": 0.15146666666666667,
"grad_norm": 0.8023488521575928,
"kl": 0.2142333984375,
"learning_rate": 1.638888888888889e-06,
"loss": -0.1356,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.019096028059720993,
"mask/share_reasoning": 0.7625699043273926,
"mask/share_step_conf": 0.16364656388759613,
"num_tokens": 43464118.0,
"reward": 0.6316433548927307,
"reward_std": 0.24024856090545654,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6723967790603638,
"rewards/format_reward_step": 0.9375,
"rewards/step_margin_reward": 0.284639835357666,
"step": 142
},
{
"adv/mean_abs_final_conf": 0.5092817544937134,
"adv/mean_abs_reasoning": 0.3617568016052246,
"adv/mean_abs_step_conf": 0.6399293541908264,
"adv/ratio_final_to_reasoning": 1.407801462844308,
"adv/ratio_step_to_reasoning": 1.7689490601179185,
"adv/std_final_conf": 0.7648191452026367,
"adv/std_reasoning": 0.6815710663795471,
"adv/std_step_conf": 0.8593825697898865,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 14.4375,
"calib/ece": 0.24520800000000006,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.768,
"calib/gap": 0.22287359708193044,
"calib/mean_conf": 0.8428880000000001,
"calib/mu_c": 0.9213395061728396,
"calib/mu_w": 0.6984659090909091,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.22004800000000008,
"calib/std_conf": 0.3032960425986465,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.34825138818778395,
"calib/step_q_c_n": 1981.0,
"calib/step_q_gap": 0.010504449412273742,
"calib/step_q_w": 0.3377469387755102,
"calib/step_q_w_n": 1715.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2523.0,
"completions/max_terminated_length": 2523.0,
"completions/mean_length": 805.5546875,
"completions/mean_terminated_length": 824.8880615234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 315.0,
"epoch": 0.15253333333333333,
"grad_norm": 1.2207856178283691,
"kl": 0.22381591796875,
"learning_rate": 1.6111111111111113e-06,
"loss": -0.0968,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.019887078553438187,
"mask/share_reasoning": 0.7877940535545349,
"mask/share_step_conf": 0.168881356716156,
"num_tokens": 43777676.0,
"reward": 0.6582680940628052,
"reward_std": 0.20583592355251312,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.7261790037155151,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.26848214864730835,
"step": 143
},
{
"adv/mean_abs_final_conf": 0.47069185972213745,
"adv/mean_abs_reasoning": 0.3482842445373535,
"adv/mean_abs_step_conf": 0.6167457103729248,
"adv/ratio_final_to_reasoning": 1.351458950856032,
"adv/ratio_step_to_reasoning": 1.7708113991552632,
"adv/std_final_conf": 0.7149190306663513,
"adv/std_reasoning": 0.6403000950813293,
"adv/std_step_conf": 0.8437038064002991,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 13.3515625,
"calib/ece": 0.16172199999999998,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.816,
"calib/gap": 0.28898601578586147,
"calib/mean_conf": 0.877882,
"calib/mu_c": 0.9495505319148935,
"calib/mu_w": 0.660564516129032,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14380199999999996,
"calib/std_conf": 0.27151917625832617,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.35282960776043865,
"calib/step_q_c_n": 2371.0,
"calib/step_q_gap": -0.02016370647069793,
"calib/step_q_w": 0.3729933142311366,
"calib/step_q_w_n": 1047.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2213.0,
"completions/max_terminated_length": 2213.0,
"completions/mean_length": 718.49609375,
"completions/mean_terminated_length": 738.6947631835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 189.0,
"epoch": 0.1536,
"grad_norm": 1.1345633268356323,
"kl": 0.2506103515625,
"learning_rate": 1.5833333333333333e-06,
"loss": -0.0998,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.022363772615790367,
"mask/share_reasoning": 0.7747567892074585,
"mask/share_step_conf": 0.17553575336933136,
"num_tokens": 44065739.0,
"reward": 0.7319254875183105,
"reward_std": 0.17399638891220093,
"rewards/accuracy_reward_step": 0.734375,
"rewards/final_brier_reward_step": 0.8122310042381287,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.3094324469566345,
"step": 144
},
{
"adv/mean_abs_final_conf": 0.6201844811439514,
"adv/mean_abs_reasoning": 0.4980112612247467,
"adv/mean_abs_step_conf": 0.6163833141326904,
"adv/ratio_final_to_reasoning": 1.2453222033950542,
"adv/ratio_step_to_reasoning": 1.2376895105079235,
"adv/std_final_conf": 0.8292064666748047,
"adv/std_reasoning": 0.7754777669906616,
"adv/std_step_conf": 0.8439358472824097,
"calib/answer_extract_rate": 0.96484375,
"calib/avg_num_step_conf": 14.78515625,
"calib/ece": 0.2414574898785426,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.8097165991902834,
"calib/gap": 0.13474309664694284,
"calib/mean_conf": 0.8806477732793522,
"calib/mu_c": 0.923198224852071,
"calib/mu_w": 0.7884551282051282,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21894736842105272,
"calib/std_conf": 0.2584896951428465,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3552679582063561,
"calib/step_q_c_n": 2297.0,
"calib/step_q_gap": -0.04995852028826753,
"calib/step_q_w": 0.4052264784946236,
"calib/step_q_w_n": 1488.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2623.0,
"completions/max_terminated_length": 2623.0,
"completions/mean_length": 796.2421875,
"completions/mean_terminated_length": 821.9273681640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 267.0,
"epoch": 0.15466666666666667,
"grad_norm": 0.8662506937980652,
"kl": 0.230865478515625,
"learning_rate": 1.5555555555555558e-06,
"loss": -0.0984,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.01997159980237484,
"mask/share_reasoning": 0.773935079574585,
"mask/share_step_conf": 0.17484331130981445,
"num_tokens": 44372281.0,
"reward": 0.6226569414138794,
"reward_std": 0.24305683374404907,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.7108544111251831,
"rewards/format_reward_step": 0.96484375,
"rewards/step_margin_reward": 0.20945948362350464,
"step": 145
},
{
"adv/mean_abs_final_conf": 0.6963587403297424,
"adv/mean_abs_reasoning": 0.5124849081039429,
"adv/mean_abs_step_conf": 0.6940493583679199,
"adv/ratio_final_to_reasoning": 1.358788774690134,
"adv/ratio_step_to_reasoning": 1.354282530847039,
"adv/std_final_conf": 0.8616522550582886,
"adv/std_reasoning": 0.7755056619644165,
"adv/std_step_conf": 0.8758512735366821,
"calib/answer_extract_rate": 0.94140625,
"calib/avg_num_step_conf": 15.2734375,
"calib/ece": 0.297390041493776,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.7468879668049793,
"calib/gap": 0.18462630226915944,
"calib/mean_conf": 0.8753153526970954,
"calib/mu_c": 0.9503916083916084,
"calib/mu_w": 0.765765306122449,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.28967219917012454,
"calib/std_conf": 0.2290983049539202,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.34514803471158756,
"calib/step_q_c_n": 1959.0,
"calib/step_q_gap": -0.03828712674407625,
"calib/step_q_w": 0.3834351614556638,
"calib/step_q_w_n": 1951.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 2089.0,
"completions/max_terminated_length": 2089.0,
"completions/mean_length": 775.6171875,
"completions/mean_terminated_length": 823.8921508789062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 322.0,
"epoch": 0.15573333333333333,
"grad_norm": 1.391205072402954,
"kl": 0.225616455078125,
"learning_rate": 1.527777777777778e-06,
"loss": -0.2554,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.019138246774673462,
"mask/share_reasoning": 0.7537720799446106,
"mask/share_step_conf": 0.16849590837955475,
"num_tokens": 44678055.0,
"reward": 0.6086395382881165,
"reward_std": 0.2709296643733978,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.6738836765289307,
"rewards/format_reward_step": 0.94140625,
"rewards/step_margin_reward": 0.2433953881263733,
"step": 146
},
{
"adv/mean_abs_final_conf": 0.5746307373046875,
"adv/mean_abs_reasoning": 0.40634775161743164,
"adv/mean_abs_step_conf": 0.5668737888336182,
"adv/ratio_final_to_reasoning": 1.414135392696084,
"adv/ratio_step_to_reasoning": 1.3950459589778132,
"adv/std_final_conf": 0.7812044620513916,
"adv/std_reasoning": 0.7015212178230286,
"adv/std_step_conf": 0.7938024997711182,
"calib/answer_extract_rate": 0.95703125,
"calib/avg_num_step_conf": 14.22265625,
"calib/ece": 0.19931224489795923,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.6408163265306123,
"calib/gap": 0.25400596760443295,
"calib/mean_conf": 0.7928632653061224,
"calib/mu_c": 0.8882450980392157,
"calib/mu_w": 0.6342391304347827,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1838428571428572,
"calib/std_conf": 0.2943720654822338,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3583738508682329,
"calib/step_q_c_n": 1958.0,
"calib/step_q_gap": -0.03442353475268212,
"calib/step_q_w": 0.39279738562091504,
"calib/step_q_w_n": 1683.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2033.0,
"completions/max_terminated_length": 2033.0,
"completions/mean_length": 764.7734375,
"completions/mean_terminated_length": 799.1101684570312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 261.0,
"epoch": 0.1568,
"grad_norm": 1.6528960466384888,
"kl": 0.23443603515625,
"learning_rate": 1.5e-06,
"loss": -0.1632,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.020229840651154518,
"mask/share_reasoning": 0.7700471878051758,
"mask/share_step_conf": 0.16675424575805664,
"num_tokens": 44977517.0,
"reward": 0.6164664626121521,
"reward_std": 0.19761189818382263,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.7365533709526062,
"rewards/format_reward_step": 0.95703125,
"rewards/step_margin_reward": 0.1854419857263565,
"step": 147
},
{
"adv/mean_abs_final_conf": 0.5271502137184143,
"adv/mean_abs_reasoning": 0.3972645401954651,
"adv/mean_abs_step_conf": 0.5814423561096191,
"adv/ratio_final_to_reasoning": 1.3269500808177892,
"adv/ratio_step_to_reasoning": 1.4636150405559316,
"adv/std_final_conf": 0.754736065864563,
"adv/std_reasoning": 0.6613168120384216,
"adv/std_step_conf": 0.8106851577758789,
"calib/answer_extract_rate": 0.95703125,
"calib/avg_num_step_conf": 14.140625,
"calib/ece": 0.13375510204081634,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.726530612244898,
"calib/gap": 0.27694385686585565,
"calib/mean_conf": 0.8571020408163265,
"calib/mu_c": 0.9271857923497266,
"calib/mu_w": 0.650241935483871,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.12195918367346939,
"calib/std_conf": 0.2374820504407635,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.41528641425389756,
"calib/step_q_c_n": 2245.0,
"calib/step_q_gap": -0.003044494837011469,
"calib/step_q_w": 0.41833090909090903,
"calib/step_q_w_n": 1375.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2535.0,
"completions/max_terminated_length": 2535.0,
"completions/mean_length": 701.2734375,
"completions/mean_terminated_length": 732.7591552734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 213.0,
"epoch": 0.15786666666666666,
"grad_norm": 1.5334172248840332,
"kl": 0.25244140625,
"learning_rate": 1.4722222222222225e-06,
"loss": -0.0908,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.02250342071056366,
"mask/share_reasoning": 0.7561869025230408,
"mask/share_step_conf": 0.17834091186523438,
"num_tokens": 45262155.0,
"reward": 0.7265738248825073,
"reward_std": 0.17858600616455078,
"rewards/accuracy_reward_step": 0.71484375,
"rewards/final_brier_reward_step": 0.8107410073280334,
"rewards/format_reward_step": 0.95703125,
"rewards/step_margin_reward": 0.3080315589904785,
"step": 148
},
{
"adv/mean_abs_final_conf": 0.6013184189796448,
"adv/mean_abs_reasoning": 0.44574636220932007,
"adv/mean_abs_step_conf": 0.685305118560791,
"adv/ratio_final_to_reasoning": 1.3490147535904486,
"adv/ratio_step_to_reasoning": 1.5374328915756255,
"adv/std_final_conf": 0.8094089031219482,
"adv/std_reasoning": 0.7206778526306152,
"adv/std_step_conf": 0.891316831111908,
"calib/answer_extract_rate": 0.9296875,
"calib/avg_num_step_conf": 15.16796875,
"calib/ece": 0.1078151260504202,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.6764705882352942,
"calib/gap": 0.3305325670498085,
"calib/mean_conf": 0.810672268907563,
"calib/mu_c": 0.8912222222222224,
"calib/mu_w": 0.5606896551724139,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08109243697478997,
"calib/std_conf": 0.28979123489025993,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.37588042515500447,
"calib/step_q_c_n": 2258.0,
"calib/step_q_gap": -0.03427043691396109,
"calib/step_q_w": 0.41015086206896556,
"calib/step_q_w_n": 1624.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2708.0,
"completions/max_terminated_length": 2708.0,
"completions/mean_length": 757.67578125,
"completions/mean_terminated_length": 808.1875610351562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 264.0,
"epoch": 0.15893333333333334,
"grad_norm": 1.1942827701568604,
"kl": 0.230194091796875,
"learning_rate": 1.4444444444444445e-06,
"loss": -0.1477,
"mask/has_final_conf_rate": 0.9296875,
"mask/share_final_conf": 0.019954249262809753,
"mask/share_reasoning": 0.7544931173324585,
"mask/share_step_conf": 0.16305264830589294,
"num_tokens": 45560576.0,
"reward": 0.6850230693817139,
"reward_std": 0.22184012830257416,
"rewards/accuracy_reward_step": 0.703125,
"rewards/final_brier_reward_step": 0.7878589630126953,
"rewards/format_reward_step": 0.92578125,
"rewards/step_margin_reward": 0.2564058005809784,
"step": 149
},
{
"adv/mean_abs_final_conf": 0.5935416221618652,
"adv/mean_abs_reasoning": 0.30029815435409546,
"adv/mean_abs_step_conf": 0.6016957759857178,
"adv/ratio_final_to_reasoning": 1.9765077259249246,
"adv/ratio_step_to_reasoning": 2.003661252197476,
"adv/std_final_conf": 0.8034288287162781,
"adv/std_reasoning": 0.596147894859314,
"adv/std_step_conf": 0.827460527420044,
"calib/answer_extract_rate": 0.9453125,
"calib/avg_num_step_conf": 13.7734375,
"calib/ece": 0.14944214876033057,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.6942148760330579,
"calib/gap": 0.2672503170577044,
"calib/mean_conf": 0.8287809917355372,
"calib/mu_c": 0.912710843373494,
"calib/mu_w": 0.6454605263157895,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14613636363636365,
"calib/std_conf": 0.26962259660006904,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4318985823336969,
"calib/step_q_c_n": 1834.0,
"calib/step_q_gap": -0.01707937274904553,
"calib/step_q_w": 0.4489779550827424,
"calib/step_q_w_n": 1692.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 2401.0,
"completions/max_terminated_length": 2401.0,
"completions/mean_length": 642.1953125,
"completions/mean_terminated_length": 679.3471069335938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 188.0,
"epoch": 0.16,
"grad_norm": 1.7494480609893799,
"kl": 0.28179931640625,
"learning_rate": 1.4166666666666667e-06,
"loss": -0.1658,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.023424550890922546,
"mask/share_reasoning": 0.7520290613174438,
"mask/share_step_conf": 0.1698589324951172,
"num_tokens": 45829938.0,
"reward": 0.6518896222114563,
"reward_std": 0.17572720348834991,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.7625117897987366,
"rewards/format_reward_step": 0.9453125,
"rewards/step_margin_reward": 0.2225174605846405,
"step": 150
},
{
"adv/mean_abs_final_conf": 0.6138310432434082,
"adv/mean_abs_reasoning": 0.4014820456504822,
"adv/mean_abs_step_conf": 0.498024582862854,
"adv/ratio_final_to_reasoning": 1.528912811652331,
"adv/ratio_step_to_reasoning": 1.2404653913127133,
"adv/std_final_conf": 0.8315180540084839,
"adv/std_reasoning": 0.7014200091362,
"adv/std_step_conf": 0.7584320902824402,
"calib/answer_extract_rate": 0.9296875,
"calib/avg_num_step_conf": 16.0390625,
"calib/ece": 0.14950840336134447,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.5630252100840336,
"calib/gap": 0.27281709956709954,
"calib/mean_conf": 0.7419453781512605,
"calib/mu_c": 0.8382337662337663,
"calib/mu_w": 0.5654166666666668,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.12219747899159655,
"calib/std_conf": 0.31073299280052796,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4112959486166008,
"calib/step_q_c_n": 2024.0,
"calib/step_q_gap": -0.0113418515755222,
"calib/step_q_w": 0.422637800192123,
"calib/step_q_w_n": 2082.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 2941.0,
"completions/max_terminated_length": 2941.0,
"completions/mean_length": 759.1875,
"completions/mean_terminated_length": 813.1882934570312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 346.0,
"epoch": 0.16106666666666666,
"grad_norm": 1.1843147277832031,
"kl": 0.229949951171875,
"learning_rate": 1.3888888888888892e-06,
"loss": -0.2223,
"mask/has_final_conf_rate": 0.9296875,
"mask/share_final_conf": 0.019284039735794067,
"mask/share_reasoning": 0.7494036555290222,
"mask/share_step_conf": 0.1649060696363449,
"num_tokens": 46131314.0,
"reward": 0.5891934633255005,
"reward_std": 0.1657007783651352,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.7350819110870361,
"rewards/format_reward_step": 0.9296875,
"rewards/step_margin_reward": 0.1370549350976944,
"step": 151
},
{
"adv/mean_abs_final_conf": 0.6327756643295288,
"adv/mean_abs_reasoning": 0.4947320222854614,
"adv/mean_abs_step_conf": 0.6735621690750122,
"adv/ratio_final_to_reasoning": 1.279027101189775,
"adv/ratio_step_to_reasoning": 1.3614687118157986,
"adv/std_final_conf": 0.8337776064872742,
"adv/std_reasoning": 0.7577208280563354,
"adv/std_step_conf": 0.8758518695831299,
"calib/answer_extract_rate": 0.93359375,
"calib/avg_num_step_conf": 14.60546875,
"calib/ece": 0.121781512605042,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.542016806722689,
"calib/gap": 0.27947975151372173,
"calib/mean_conf": 0.7352268907563024,
"calib/mu_c": 0.8303439490445859,
"calib/mu_w": 0.5508641975308641,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09867226890756299,
"calib/std_conf": 0.309116956924721,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4365304391217565,
"calib/step_q_c_n": 2004.0,
"calib/step_q_gap": 0.0100405255770879,
"calib/step_q_w": 0.4264899135446686,
"calib/step_q_w_n": 1735.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 2986.0,
"completions/max_terminated_length": 2986.0,
"completions/mean_length": 738.37890625,
"completions/mean_terminated_length": 790.8995361328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 305.0,
"epoch": 0.16213333333333332,
"grad_norm": 1.4904396533966064,
"kl": 0.239471435546875,
"learning_rate": 1.3611111111111112e-06,
"loss": -0.2374,
"mask/has_final_conf_rate": 0.9296875,
"mask/share_final_conf": 0.02063142880797386,
"mask/share_reasoning": 0.7482746839523315,
"mask/share_step_conf": 0.1646876186132431,
"num_tokens": 46425731.0,
"reward": 0.6711384057998657,
"reward_std": 0.20814509689807892,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.7434897422790527,
"rewards/format_reward_step": 0.9296875,
"rewards/step_margin_reward": 0.28941214084625244,
"step": 152
},
{
"adv/mean_abs_final_conf": 0.6002535820007324,
"adv/mean_abs_reasoning": 0.3623543977737427,
"adv/mean_abs_step_conf": 0.620030403137207,
"adv/ratio_final_to_reasoning": 1.6565373173020963,
"adv/ratio_step_to_reasoning": 1.7111159874051247,
"adv/std_final_conf": 0.8012081384658813,
"adv/std_reasoning": 0.6612955927848816,
"adv/std_step_conf": 0.8599606156349182,
"calib/answer_extract_rate": 0.9375,
"calib/avg_num_step_conf": 15.05078125,
"calib/ece": 0.14890041493775932,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.6390041493775933,
"calib/gap": 0.26141066732412876,
"calib/mean_conf": 0.7865767634854771,
"calib/mu_c": 0.8646745562130177,
"calib/mu_w": 0.6032638888888889,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1171161825726141,
"calib/std_conf": 0.3065826960243486,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4296149809160305,
"calib/step_q_c_n": 2096.0,
"calib/step_q_gap": 0.002485328098728312,
"calib/step_q_w": 0.4271296528173022,
"calib/step_q_w_n": 1757.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 2956.0,
"completions/max_terminated_length": 2956.0,
"completions/mean_length": 775.05078125,
"completions/mean_terminated_length": 819.8883666992188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 287.0,
"epoch": 0.1632,
"grad_norm": 1.214044213294983,
"kl": 0.223876953125,
"learning_rate": 1.3333333333333334e-06,
"loss": -0.0627,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.018918223679065704,
"mask/share_reasoning": 0.7644103169441223,
"mask/share_step_conf": 0.16198396682739258,
"num_tokens": 46731464.0,
"reward": 0.669912576675415,
"reward_std": 0.19476324319839478,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.749024510383606,
"rewards/format_reward_step": 0.9375,
"rewards/step_margin_reward": 0.2712693512439728,
"step": 153
},
{
"adv/mean_abs_final_conf": 0.5115103721618652,
"adv/mean_abs_reasoning": 0.4170297682285309,
"adv/mean_abs_step_conf": 0.6089221239089966,
"adv/ratio_final_to_reasoning": 1.2265560186139022,
"adv/ratio_step_to_reasoning": 1.4601406669255068,
"adv/std_final_conf": 0.7640756368637085,
"adv/std_reasoning": 0.7015310525894165,
"adv/std_step_conf": 0.8599308729171753,
"calib/answer_extract_rate": 0.8984375,
"calib/avg_num_step_conf": 15.87890625,
"calib/ece": 0.17054347826086955,
"calib/final_conf_rate": 0.8984375,
"calib/format_rate": 0.8984375,
"calib/frac_conf_gt_0.9": 0.7,
"calib/gap": 0.4025906394199076,
"calib/mean_conf": 0.8080217391304347,
"calib/mu_c": 0.9515540540540539,
"calib/mu_w": 0.5489634146341463,
"calib/nonempty_final_conf_rate": 0.8984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16754347826086954,
"calib/std_conf": 0.313698656920136,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4056774020397209,
"calib/step_q_c_n": 1863.0,
"calib/step_q_gap": -0.036074641557009346,
"calib/step_q_w": 0.44175204359673026,
"calib/step_q_w_n": 2202.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2961.0,
"completions/max_terminated_length": 2961.0,
"completions/mean_length": 703.546875,
"completions/mean_terminated_length": 776.3275756835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 313.0,
"epoch": 0.16426666666666667,
"grad_norm": 0.8250810503959656,
"kl": 0.239288330078125,
"learning_rate": 1.3055555555555556e-06,
"loss": -0.2055,
"mask/has_final_conf_rate": 0.8984375,
"mask/share_final_conf": 0.01885397545993328,
"mask/share_reasoning": 0.7259734869003296,
"mask/share_step_conf": 0.16142255067825317,
"num_tokens": 47016012.0,
"reward": 0.5886725783348083,
"reward_std": 0.21505558490753174,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.7455453872680664,
"rewards/format_reward_step": 0.8984375,
"rewards/step_margin_reward": 0.13648727536201477,
"step": 154
},
{
"adv/mean_abs_final_conf": 0.6118817925453186,
"adv/mean_abs_reasoning": 0.47593390941619873,
"adv/mean_abs_step_conf": 0.6540273427963257,
"adv/ratio_final_to_reasoning": 1.2856444570126964,
"adv/ratio_step_to_reasoning": 1.3741978242285449,
"adv/std_final_conf": 0.8000847697257996,
"adv/std_reasoning": 0.7207589149475098,
"adv/std_step_conf": 0.8598897457122803,
"calib/answer_extract_rate": 0.9453125,
"calib/avg_num_step_conf": 14.05078125,
"calib/ece": 0.2131818181818182,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.6735537190082644,
"calib/gap": 0.30535830266291397,
"calib/mean_conf": 0.8160743801652893,
"calib/mu_c": 0.9334228187919462,
"calib/mu_w": 0.6280645161290322,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20677685950413222,
"calib/std_conf": 0.28303673987749994,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.404281045751634,
"calib/step_q_c_n": 1836.0,
"calib/step_q_gap": 0.005331017358675416,
"calib/step_q_w": 0.39895002839295857,
"calib/step_q_w_n": 1761.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 2028.0,
"completions/max_terminated_length": 2028.0,
"completions/mean_length": 698.140625,
"completions/mean_terminated_length": 738.5288696289062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 310.0,
"epoch": 0.16533333333333333,
"grad_norm": 1.3057198524475098,
"kl": 0.243377685546875,
"learning_rate": 1.2777777777777779e-06,
"loss": -0.1336,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.020220771431922913,
"mask/share_reasoning": 0.753272294998169,
"mask/share_step_conf": 0.17181944847106934,
"num_tokens": 47301952.0,
"reward": 0.629758358001709,
"reward_std": 0.2366112768650055,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7445582151412964,
"rewards/format_reward_step": 0.9453125,
"rewards/step_margin_reward": 0.20948976278305054,
"step": 155
},
{
"adv/mean_abs_final_conf": 0.5225203037261963,
"adv/mean_abs_reasoning": 0.36599862575531006,
"adv/mean_abs_step_conf": 0.6580045223236084,
"adv/ratio_final_to_reasoning": 1.4276564635942908,
"adv/ratio_step_to_reasoning": 1.797833314170748,
"adv/std_final_conf": 0.7769670486450195,
"adv/std_reasoning": 0.6816434860229492,
"adv/std_step_conf": 0.8757668137550354,
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 13.703125,
"calib/ece": 0.20048387096774195,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.657258064516129,
"calib/gap": 0.21715750232991615,
"calib/mean_conf": 0.7933064516129033,
"calib/mu_c": 0.858103448275862,
"calib/mu_w": 0.6409459459459459,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14608870967741938,
"calib/std_conf": 0.31163938727367596,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42193717277486914,
"calib/step_q_c_n": 2101.0,
"calib/step_q_gap": 0.02631172856733538,
"calib/step_q_w": 0.39562544420753376,
"calib/step_q_w_n": 1407.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2892.0,
"completions/max_terminated_length": 2892.0,
"completions/mean_length": 771.7578125,
"completions/mean_terminated_length": 793.4537963867188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 266.0,
"epoch": 0.1664,
"grad_norm": 1.2064683437347412,
"kl": 0.23663330078125,
"learning_rate": 1.25e-06,
"loss": -0.0808,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.021276123821735382,
"mask/share_reasoning": 0.779464840888977,
"mask/share_step_conf": 0.17191532254219055,
"num_tokens": 47604282.0,
"reward": 0.6614023447036743,
"reward_std": 0.17743799090385437,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/final_brier_reward_step": 0.7517943382263184,
"rewards/format_reward_step": 0.96875,
"rewards/step_margin_reward": 0.24132287502288818,
"step": 156
},
{
"adv/mean_abs_final_conf": 0.529394805431366,
"adv/mean_abs_reasoning": 0.47357791662216187,
"adv/mean_abs_step_conf": 0.6258033514022827,
"adv/ratio_final_to_reasoning": 1.1178621022013087,
"adv/ratio_step_to_reasoning": 1.3214369366415621,
"adv/std_final_conf": 0.777289867401123,
"adv/std_reasoning": 0.757584273815155,
"adv/std_step_conf": 0.8599728941917419,
"calib/answer_extract_rate": 0.92578125,
"calib/avg_num_step_conf": 15.8203125,
"calib/ece": 0.16551476793248956,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.7805907172995781,
"calib/gap": 0.3085774451792551,
"calib/mean_conf": 0.8546286919831223,
"calib/mu_c": 0.9431656804733728,
"calib/mu_w": 0.6345882352941177,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15353164556962037,
"calib/std_conf": 0.29181688470317674,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4209602349751469,
"calib/step_q_c_n": 2213.0,
"calib/step_q_gap": -0.069750706777711,
"calib/step_q_w": 0.4907109417528579,
"calib/step_q_w_n": 1837.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 2756.0,
"completions/max_terminated_length": 2756.0,
"completions/mean_length": 703.27734375,
"completions/mean_terminated_length": 756.4664306640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 209.0,
"epoch": 0.16746666666666668,
"grad_norm": 1.2208685874938965,
"kl": 0.237213134765625,
"learning_rate": 1.2222222222222223e-06,
"loss": -0.1109,
"mask/has_final_conf_rate": 0.92578125,
"mask/share_final_conf": 0.021750878542661667,
"mask/share_reasoning": 0.7368344068527222,
"mask/share_step_conf": 0.17110225558280945,
"num_tokens": 47888049.0,
"reward": 0.629758358001709,
"reward_std": 0.22569067776203156,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.7558800578117371,
"rewards/format_reward_step": 0.92578125,
"rewards/step_margin_reward": 0.18644914031028748,
"step": 157
},
{
"adv/mean_abs_final_conf": 0.499210000038147,
"adv/mean_abs_reasoning": 0.45436128973960876,
"adv/mean_abs_step_conf": 0.7095964550971985,
"adv/ratio_final_to_reasoning": 1.098707155101705,
"adv/ratio_step_to_reasoning": 1.5617449618207202,
"adv/std_final_conf": 0.7592298984527588,
"adv/std_reasoning": 0.7206823825836182,
"adv/std_step_conf": 0.8913768529891968,
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 13.5546875,
"calib/ece": 0.20483669354838707,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.8185483870967742,
"calib/gap": 0.20857639921722104,
"calib/mean_conf": 0.886530241935484,
"calib/mu_c": 0.9479257142857143,
"calib/mu_w": 0.7393493150684932,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19286088709677415,
"calib/std_conf": 0.25592254469626635,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42429733840304185,
"calib/step_q_c_n": 2104.0,
"calib/step_q_gap": -0.057404345345128005,
"calib/step_q_w": 0.48170168374816985,
"calib/step_q_w_n": 1366.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2309.0,
"completions/max_terminated_length": 2309.0,
"completions/mean_length": 706.2109375,
"completions/mean_terminated_length": 728.991943359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 247.0,
"epoch": 0.16853333333333334,
"grad_norm": 1.058288335800171,
"kl": 0.25054931640625,
"learning_rate": 1.1944444444444446e-06,
"loss": -0.0759,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.0231400728225708,
"mask/share_reasoning": 0.7669098377227783,
"mask/share_step_conf": 0.1787000596523285,
"num_tokens": 48174079.0,
"reward": 0.6625410914421082,
"reward_std": 0.2515299916267395,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.7563234567642212,
"rewards/format_reward_step": 0.96875,
"rewards/step_margin_reward": 0.23828986287117004,
"step": 158
},
{
"adv/mean_abs_final_conf": 0.4797705411911011,
"adv/mean_abs_reasoning": 0.42208725214004517,
"adv/mean_abs_step_conf": 0.6270284652709961,
"adv/ratio_final_to_reasoning": 1.1366620023670297,
"adv/ratio_step_to_reasoning": 1.4855422950867825,
"adv/std_final_conf": 0.7398706078529358,
"adv/std_reasoning": 0.7013546228408813,
"adv/std_step_conf": 0.860073447227478,
"calib/answer_extract_rate": 0.94921875,
"calib/avg_num_step_conf": 14.24609375,
"calib/ece": 0.17094650205761305,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.7613168724279835,
"calib/gap": 0.1921494607087827,
"calib/mean_conf": 0.8795061728395062,
"calib/mu_c": 0.9316949152542373,
"calib/mu_w": 0.7395454545454546,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16102880658436203,
"calib/std_conf": 0.24471820310939163,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.418492017208413,
"calib/step_q_c_n": 2092.0,
"calib/step_q_gap": -0.09835685738965771,
"calib/step_q_w": 0.5168488745980707,
"calib/step_q_w_n": 1555.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 1897.0,
"completions/max_terminated_length": 1897.0,
"completions/mean_length": 673.609375,
"completions/mean_terminated_length": 709.6460571289062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 289.0,
"epoch": 0.1696,
"grad_norm": 1.5005854368209839,
"kl": 0.256011962890625,
"learning_rate": 1.1666666666666668e-06,
"loss": -0.1241,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.022273514419794083,
"mask/share_reasoning": 0.7569800615310669,
"mask/share_step_conf": 0.16996517777442932,
"num_tokens": 48451307.0,
"reward": 0.6668627262115479,
"reward_std": 0.2029997706413269,
"rewards/accuracy_reward_step": 0.69140625,
"rewards/final_brier_reward_step": 0.7550758123397827,
"rewards/format_reward_step": 0.94921875,
"rewards/step_margin_reward": 0.25052475929260254,
"step": 159
},
{
"adv/mean_abs_final_conf": 0.49061745405197144,
"adv/mean_abs_reasoning": 0.4065553843975067,
"adv/mean_abs_step_conf": 0.6536253690719604,
"adv/ratio_final_to_reasoning": 1.2067665879743303,
"adv/ratio_step_to_reasoning": 1.6077154408878342,
"adv/std_final_conf": 0.7587159872055054,
"adv/std_reasoning": 0.6817222237586975,
"adv/std_step_conf": 0.8598037958145142,
"calib/answer_extract_rate": 0.9375,
"calib/avg_num_step_conf": 13.7578125,
"calib/ece": 0.22365145228215771,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.7385892116182573,
"calib/gap": 0.16865381526104417,
"calib/mean_conf": 0.8376348547717842,
"calib/mu_c": 0.8901204819277108,
"calib/mu_w": 0.7214666666666666,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18624481327800835,
"calib/std_conf": 0.2938403034479326,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4180683289124668,
"calib/step_q_c_n": 1885.0,
"calib/step_q_gap": 0.013894840824501042,
"calib/step_q_w": 0.40417348808796577,
"calib/step_q_w_n": 1637.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 2213.0,
"completions/max_terminated_length": 2213.0,
"completions/mean_length": 672.109375,
"completions/mean_terminated_length": 713.9419555664062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 268.0,
"epoch": 0.17066666666666666,
"grad_norm": 1.0761736631393433,
"kl": 0.245941162109375,
"learning_rate": 1.138888888888889e-06,
"loss": -0.0619,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.020725876092910767,
"mask/share_reasoning": 0.7537473440170288,
"mask/share_step_conf": 0.166933074593544,
"num_tokens": 48728207.0,
"reward": 0.5751692652702332,
"reward_std": 0.19423778355121613,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.7055008411407471,
"rewards/format_reward_step": 0.9375,
"rewards/step_margin_reward": 0.1276501715183258,
"step": 160
},
{
"adv/mean_abs_final_conf": 0.3587230443954468,
"adv/mean_abs_reasoning": 0.23824524879455566,
"adv/mean_abs_step_conf": 0.6313453912734985,
"adv/ratio_final_to_reasoning": 1.5056881352743445,
"adv/ratio_step_to_reasoning": 2.649981036213327,
"adv/std_final_conf": 0.6413493156433105,
"adv/std_reasoning": 0.5482035875320435,
"adv/std_step_conf": 0.8753535151481628,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 12.12109375,
"calib/ece": 0.11637795275590558,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.8385826771653543,
"calib/gap": 0.2968382279741165,
"calib/mean_conf": 0.8893700787401575,
"calib/mu_c": 0.9466341463414635,
"calib/mu_w": 0.649795918367347,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0993307086614174,
"calib/std_conf": 0.26446052064293823,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3838304777594728,
"calib/step_q_c_n": 2428.0,
"calib/step_q_gap": 0.008682329611324702,
"calib/step_q_w": 0.3751481481481481,
"calib/step_q_w_n": 675.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1935.0,
"completions/max_terminated_length": 1935.0,
"completions/mean_length": 696.765625,
"completions/mean_terminated_length": 702.251953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 215.0,
"epoch": 0.17173333333333332,
"grad_norm": 1.3555985689163208,
"kl": 0.246795654296875,
"learning_rate": 1.111111111111111e-06,
"loss": -0.0074,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.023145297542214394,
"mask/share_reasoning": 0.7910975217819214,
"mask/share_step_conf": 0.17794470489025116,
"num_tokens": 49010499.0,
"reward": 0.7200655341148376,
"reward_std": 0.16070488095283508,
"rewards/accuracy_reward_step": 0.80078125,
"rewards/final_brier_reward_step": 0.8533073663711548,
"rewards/format_reward_step": 0.9921875,
"rewards/step_margin_reward": 0.22822979092597961,
"step": 161
},
{
"adv/mean_abs_final_conf": 0.37374114990234375,
"adv/mean_abs_reasoning": 0.28832659125328064,
"adv/mean_abs_step_conf": 0.599334716796875,
"adv/ratio_final_to_reasoning": 1.2962423905397982,
"adv/ratio_step_to_reasoning": 2.0786661202205563,
"adv/std_final_conf": 0.6621988415718079,
"adv/std_reasoning": 0.5960350632667542,
"adv/std_step_conf": 0.827337920665741,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 12.45703125,
"calib/ece": 0.1290200000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.852,
"calib/gap": 0.23135292923139394,
"calib/mean_conf": 0.9166200000000001,
"calib/mu_c": 0.9619651741293532,
"calib/mu_w": 0.7306122448979593,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.12082000000000012,
"calib/std_conf": 0.21414919005216898,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.36848850325379606,
"calib/step_q_c_n": 2305.0,
"calib/step_q_gap": -0.05397360081860214,
"calib/step_q_w": 0.4224621040723982,
"calib/step_q_w_n": 884.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2148.0,
"completions/max_terminated_length": 2148.0,
"completions/mean_length": 676.40234375,
"completions/mean_terminated_length": 692.6360473632812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 266.0,
"epoch": 0.1728,
"grad_norm": 1.0031044483184814,
"kl": 0.25445556640625,
"learning_rate": 1.0833333333333335e-06,
"loss": -0.0842,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.023875439539551735,
"mask/share_reasoning": 0.7759991884231567,
"mask/share_step_conf": 0.1766878217458725,
"num_tokens": 49287802.0,
"reward": 0.7327048778533936,
"reward_std": 0.16267752647399902,
"rewards/accuracy_reward_step": 0.78515625,
"rewards/final_brier_reward_step": 0.8367069363594055,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.2763589918613434,
"step": 162
},
{
"adv/mean_abs_final_conf": 0.4780116081237793,
"adv/mean_abs_reasoning": 0.4224798083305359,
"adv/mean_abs_step_conf": 0.5790718793869019,
"adv/ratio_final_to_reasoning": 1.131442494287909,
"adv/ratio_step_to_reasoning": 1.3706498345451172,
"adv/std_final_conf": 0.7584898471832275,
"adv/std_reasoning": 0.7206315994262695,
"adv/std_step_conf": 0.8108139038085938,
"calib/answer_extract_rate": 0.96484375,
"calib/avg_num_step_conf": 14.765625,
"calib/ece": 0.19218623481781372,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.7206477732793523,
"calib/gap": 0.41208835904628327,
"calib/mean_conf": 0.8083805668016194,
"calib/mu_c": 0.9618709677419355,
"calib/mu_w": 0.5497826086956522,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18651821862348172,
"calib/std_conf": 0.3330636358615001,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.40701116725619,
"calib/step_q_c_n": 1979.0,
"calib/step_q_gap": -0.03158239187762446,
"calib/step_q_w": 0.4385935591338145,
"calib/step_q_w_n": 1801.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2508.0,
"completions/max_terminated_length": 2508.0,
"completions/mean_length": 760.453125,
"completions/mean_terminated_length": 788.1619873046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.17386666666666667,
"grad_norm": 0.8664852380752563,
"kl": 0.232330322265625,
"learning_rate": 1.0555555555555557e-06,
"loss": -0.1173,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.02123209834098816,
"mask/share_reasoning": 0.7678343057632446,
"mask/share_step_conf": 0.17577733099460602,
"num_tokens": 49587310.0,
"reward": 0.676498532295227,
"reward_std": 0.19726118445396423,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.7866039276123047,
"rewards/format_reward_step": 0.96484375,
"rewards/step_margin_reward": 0.2523307800292969,
"step": 163
},
{
"adv/mean_abs_final_conf": 0.4758383333683014,
"adv/mean_abs_reasoning": 0.38787057995796204,
"adv/mean_abs_step_conf": 0.6178101301193237,
"adv/ratio_final_to_reasoning": 1.226796663515633,
"adv/ratio_step_to_reasoning": 1.5928254475662549,
"adv/std_final_conf": 0.7404839396476746,
"adv/std_reasoning": 0.6816709637641907,
"adv/std_step_conf": 0.8437954783439636,
"calib/answer_extract_rate": 0.9609375,
"calib/avg_num_step_conf": 14.21875,
"calib/ece": 0.17573170731707322,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.7073170731707317,
"calib/gap": 0.40962879064669333,
"calib/mean_conf": 0.8125609756097562,
"calib/mu_c": 0.9540993788819875,
"calib/mu_w": 0.5444705882352942,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1669105691056911,
"calib/std_conf": 0.316181000605292,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4039997516145057,
"calib/step_q_c_n": 2013.0,
"calib/step_q_gap": 0.022508049094530358,
"calib/step_q_w": 0.38149170251997533,
"calib/step_q_w_n": 1627.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 3013.0,
"completions/max_terminated_length": 3013.0,
"completions/mean_length": 801.28515625,
"completions/mean_terminated_length": 830.4818115234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 320.0,
"epoch": 0.17493333333333333,
"grad_norm": 1.2860913276672363,
"kl": 0.222442626953125,
"learning_rate": 1.0277777777777777e-06,
"loss": -0.1324,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.01892763376235962,
"mask/share_reasoning": 0.7816683053970337,
"mask/share_step_conf": 0.1642477661371231,
"num_tokens": 49898575.0,
"reward": 0.6890560984611511,
"reward_std": 0.20963336527347565,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.8015799522399902,
"rewards/format_reward_step": 0.9609375,
"rewards/step_margin_reward": 0.2585635185241699,
"step": 164
},
{
"adv/mean_abs_final_conf": 0.503420352935791,
"adv/mean_abs_reasoning": 0.4133051931858063,
"adv/mean_abs_step_conf": 0.6522657871246338,
"adv/ratio_final_to_reasoning": 1.2180353918501876,
"adv/ratio_step_to_reasoning": 1.5781698315883486,
"adv/std_final_conf": 0.7587146759033203,
"adv/std_reasoning": 0.7014722228050232,
"adv/std_step_conf": 0.8600209951400757,
"calib/answer_extract_rate": 0.9453125,
"calib/avg_num_step_conf": 14.3125,
"calib/ece": 0.21150000000000008,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.743801652892562,
"calib/gap": 0.304159090909091,
"calib/mean_conf": 0.8329876033057851,
"calib/mu_c": 0.9435909090909091,
"calib/mu_w": 0.6394318181818182,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20406198347107446,
"calib/std_conf": 0.3019152976478005,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.361900518134715,
"calib/step_q_c_n": 1930.0,
"calib/step_q_gap": -0.0336219732147659,
"calib/step_q_w": 0.3955224913494809,
"calib/step_q_w_n": 1734.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 2524.0,
"completions/max_terminated_length": 2524.0,
"completions/mean_length": 752.28125,
"completions/mean_terminated_length": 795.8016357421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 281.0,
"epoch": 0.176,
"grad_norm": 0.7004601955413818,
"kl": 0.222503662109375,
"learning_rate": 1.0000000000000002e-06,
"loss": -0.1832,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.019354721531271935,
"mask/share_reasoning": 0.7650870084762573,
"mask/share_step_conf": 0.16087083518505096,
"num_tokens": 50196735.0,
"reward": 0.6619113683700562,
"reward_std": 0.22242799401283264,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.7369174957275391,
"rewards/format_reward_step": 0.9453125,
"rewards/step_margin_reward": 0.2775302827358246,
"step": 165
},
{
"adv/mean_abs_final_conf": 0.44442370533943176,
"adv/mean_abs_reasoning": 0.3372817635536194,
"adv/mean_abs_step_conf": 0.6248372793197632,
"adv/ratio_final_to_reasoning": 1.3176630146170933,
"adv/ratio_step_to_reasoning": 1.8525676358438208,
"adv/std_final_conf": 0.721575140953064,
"adv/std_reasoning": 0.6404114365577698,
"adv/std_step_conf": 0.8596287369728088,
"calib/answer_extract_rate": 0.9609375,
"calib/avg_num_step_conf": 15.265625,
"calib/ece": 0.12714634146341472,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.7276422764227642,
"calib/gap": 0.44507352316192117,
"calib/mean_conf": 0.8217804878048781,
"calib/mu_c": 0.9393812154696134,
"calib/mu_w": 0.4943076923076922,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10657723577235781,
"calib/std_conf": 0.30992035527074396,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.37760216216216214,
"calib/step_q_c_n": 2405.0,
"calib/step_q_gap": -0.01938519645393899,
"calib/step_q_w": 0.3969873586161011,
"calib/step_q_w_n": 1503.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2607.0,
"completions/max_terminated_length": 2607.0,
"completions/mean_length": 763.8359375,
"completions/mean_terminated_length": 794.8861694335938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 282.0,
"epoch": 0.17706666666666668,
"grad_norm": 0.934593915939331,
"kl": 0.22467041015625,
"learning_rate": 9.722222222222224e-07,
"loss": -0.1519,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.020230621099472046,
"mask/share_reasoning": 0.7602050304412842,
"mask/share_step_conf": 0.18050184845924377,
"num_tokens": 50498461.0,
"reward": 0.7322015762329102,
"reward_std": 0.18576332926750183,
"rewards/accuracy_reward_step": 0.70703125,
"rewards/final_brier_reward_step": 0.8410079479217529,
"rewards/format_reward_step": 0.9609375,
"rewards/step_margin_reward": 0.2898014187812805,
"step": 166
},
{
"adv/mean_abs_final_conf": 0.3625107407569885,
"adv/mean_abs_reasoning": 0.3200824558734894,
"adv/mean_abs_step_conf": 0.6579821705818176,
"adv/ratio_final_to_reasoning": 1.1325542344009902,
"adv/ratio_step_to_reasoning": 2.055664590507519,
"adv/std_final_conf": 0.6561827659606934,
"adv/std_reasoning": 0.6185396313667297,
"adv/std_step_conf": 0.8756250143051147,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 13.37890625,
"calib/ece": 0.16047430830039533,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.849802371541502,
"calib/gap": 0.2274734785036293,
"calib/mean_conf": 0.901699604743083,
"calib/mu_c": 0.9502512562814072,
"calib/mu_w": 0.7227777777777779,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.137806324110672,
"calib/std_conf": 0.25181530004313113,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3782980023734177,
"calib/step_q_c_n": 2528.0,
"calib/step_q_gap": -0.03730233207474282,
"calib/step_q_w": 0.4156003344481605,
"calib/step_q_w_n": 897.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1662.0,
"completions/max_terminated_length": 1662.0,
"completions/mean_length": 745.140625,
"completions/mean_terminated_length": 753.976318359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 284.0,
"epoch": 0.17813333333333334,
"grad_norm": 1.029808521270752,
"kl": 0.228607177734375,
"learning_rate": 9.444444444444445e-07,
"loss": 0.0249,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.02139473706483841,
"mask/share_reasoning": 0.7871021628379822,
"mask/share_step_conf": 0.17978432774543762,
"num_tokens": 50794825.0,
"reward": 0.6635854840278625,
"reward_std": 0.1748582124710083,
"rewards/accuracy_reward_step": 0.77734375,
"rewards/final_brier_reward_step": 0.8220793008804321,
"rewards/format_reward_step": 0.98828125,
"rewards/step_margin_reward": 0.1519666314125061,
"step": 167
},
{
"adv/mean_abs_final_conf": 0.5207133293151855,
"adv/mean_abs_reasoning": 0.4379308223724365,
"adv/mean_abs_step_conf": 0.6242270469665527,
"adv/ratio_final_to_reasoning": 1.189031012921824,
"adv/ratio_step_to_reasoning": 1.4254010338547976,
"adv/std_final_conf": 0.7697093486785889,
"adv/std_reasoning": 0.7206395268440247,
"adv/std_step_conf": 0.84398353099823,
"calib/answer_extract_rate": 0.94140625,
"calib/avg_num_step_conf": 15.6796875,
"calib/ece": 0.17427385892116182,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.7925311203319502,
"calib/gap": 0.3150412274736485,
"calib/mean_conf": 0.8546058091286306,
"calib/mu_c": 0.9434971098265895,
"calib/mu_w": 0.628455882352941,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1555186721991701,
"calib/std_conf": 0.30617122614199294,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.39054243860404997,
"calib/step_q_c_n": 2321.0,
"calib/step_q_gap": -0.01607640368773977,
"calib/step_q_w": 0.40661884229178974,
"calib/step_q_w_n": 1693.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2946.0,
"completions/max_terminated_length": 2946.0,
"completions/mean_length": 812.65625,
"completions/mean_terminated_length": 856.1316528320312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 276.0,
"epoch": 0.1792,
"grad_norm": 1.4466100931167603,
"kl": 0.209014892578125,
"learning_rate": 9.166666666666666e-07,
"loss": -0.0975,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.01853233389556408,
"mask/share_reasoning": 0.7595194578170776,
"mask/share_step_conf": 0.17116698622703552,
"num_tokens": 51107537.0,
"reward": 0.6733118295669556,
"reward_std": 0.2267422378063202,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7650150060653687,
"rewards/format_reward_step": 0.94140625,
"rewards/step_margin_reward": 0.2581711411476135,
"step": 168
},
{
"adv/mean_abs_final_conf": 0.41139134764671326,
"adv/mean_abs_reasoning": 0.33128395676612854,
"adv/mean_abs_step_conf": 0.564087986946106,
"adv/ratio_final_to_reasoning": 1.2418088447824744,
"adv/ratio_step_to_reasoning": 1.7027325815971417,
"adv/std_final_conf": 0.7026668190956116,
"adv/std_reasoning": 0.6610427498817444,
"adv/std_step_conf": 0.8105635046958923,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 13.74609375,
"calib/ece": 0.17077777777777778,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.7103174603174603,
"calib/gap": 0.36775418835103735,
"calib/mean_conf": 0.8118095238095238,
"calib/mu_c": 0.9329349112426035,
"calib/mu_w": 0.5651807228915662,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1559761904761905,
"calib/std_conf": 0.3214193526014844,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.37050913636363636,
"calib/step_q_c_n": 2200.0,
"calib/step_q_gap": 0.009315049934523423,
"calib/step_q_w": 0.36119408642911294,
"calib/step_q_w_n": 1319.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1820.0,
"completions/max_terminated_length": 1820.0,
"completions/mean_length": 765.328125,
"completions/mean_terminated_length": 777.4762573242188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 256.0,
"epoch": 0.18026666666666666,
"grad_norm": 0.9083700776100159,
"kl": 0.21929931640625,
"learning_rate": 8.88888888888889e-07,
"loss": -0.0294,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.020419418811798096,
"mask/share_reasoning": 0.7880574464797974,
"mask/share_step_conf": 0.17589810490608215,
"num_tokens": 51407645.0,
"reward": 0.7167430520057678,
"reward_std": 0.14973390102386475,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.8055509328842163,
"rewards/format_reward_step": 0.984375,
"rewards/step_margin_reward": 0.2990289330482483,
"step": 169
},
{
"adv/mean_abs_final_conf": 0.5089055299758911,
"adv/mean_abs_reasoning": 0.45473602414131165,
"adv/mean_abs_step_conf": 0.5926600098609924,
"adv/ratio_final_to_reasoning": 1.1191229701602572,
"adv/ratio_step_to_reasoning": 1.303305606764113,
"adv/std_final_conf": 0.7939143180847168,
"adv/std_reasoning": 0.7393459677696228,
"adv/std_step_conf": 0.827715277671814,
"calib/answer_extract_rate": 0.96484375,
"calib/avg_num_step_conf": 15.0546875,
"calib/ece": 0.12503238866396754,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.6963562753036437,
"calib/gap": 0.4970921227197347,
"calib/mean_conf": 0.7695668016194331,
"calib/mu_c": 0.9044055555555556,
"calib/mu_w": 0.4073134328358209,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08292712550607281,
"calib/std_conf": 0.36600016950823094,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.39332050159235665,
"calib/step_q_c_n": 2512.0,
"calib/step_q_gap": 0.024941664036469857,
"calib/step_q_w": 0.3683788375558868,
"calib/step_q_w_n": 1342.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 1833.0,
"completions/max_terminated_length": 1833.0,
"completions/mean_length": 760.1640625,
"completions/mean_terminated_length": 787.8623657226562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 243.0,
"epoch": 0.18133333333333335,
"grad_norm": 1.3216930627822876,
"kl": 0.21307373046875,
"learning_rate": 8.611111111111112e-07,
"loss": -0.1429,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.01945631019771099,
"mask/share_reasoning": 0.7675975561141968,
"mask/share_step_conf": 0.1777898222208023,
"num_tokens": 51706399.0,
"reward": 0.7269425392150879,
"reward_std": 0.22854897379875183,
"rewards/accuracy_reward_step": 0.703125,
"rewards/final_brier_reward_step": 0.832879900932312,
"rewards/format_reward_step": 0.96484375,
"rewards/step_margin_reward": 0.28741124272346497,
"step": 170
},
{
"adv/mean_abs_final_conf": 0.43825238943099976,
"adv/mean_abs_reasoning": 0.3537520468235016,
"adv/mean_abs_step_conf": 0.5394303798675537,
"adv/ratio_final_to_reasoning": 1.2388688443396008,
"adv/ratio_step_to_reasoning": 1.5248827101110545,
"adv/std_final_conf": 0.7006688714027405,
"adv/std_reasoning": 0.6611840724945068,
"adv/std_step_conf": 0.7764024138450623,
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 13.52734375,
"calib/ece": 0.24796370967741943,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.7298387096774194,
"calib/gap": 0.27188741721854304,
"calib/mean_conf": 0.8005443548387097,
"calib/mu_c": 0.906887417218543,
"calib/mu_w": 0.635,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21981854838709686,
"calib/std_conf": 0.34674582533534126,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.39197080173347776,
"calib/step_q_c_n": 1846.0,
"calib/step_q_gap": -0.025547441927623038,
"calib/step_q_w": 0.4175182436611008,
"calib/step_q_w_n": 1617.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1847.0,
"completions/max_terminated_length": 1847.0,
"completions/mean_length": 728.2734375,
"completions/mean_terminated_length": 751.76611328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 254.0,
"epoch": 0.1824,
"grad_norm": 1.237929344177246,
"kl": 0.22076416015625,
"learning_rate": 8.333333333333333e-07,
"loss": -0.1205,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.020590022206306458,
"mask/share_reasoning": 0.7791228294372559,
"mask/share_step_conf": 0.1690371185541153,
"num_tokens": 51999733.0,
"reward": 0.6143733263015747,
"reward_std": 0.18360351026058197,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.711430549621582,
"rewards/format_reward_step": 0.96875,
"rewards/step_margin_reward": 0.20559734106063843,
"step": 171
},
{
"adv/mean_abs_final_conf": 0.42707115411758423,
"adv/mean_abs_reasoning": 0.36983591318130493,
"adv/mean_abs_step_conf": 0.6822973489761353,
"adv/ratio_final_to_reasoning": 1.1547584723288373,
"adv/ratio_step_to_reasoning": 1.8448650459795994,
"adv/std_final_conf": 0.7022022008895874,
"adv/std_reasoning": 0.6403645873069763,
"adv/std_step_conf": 0.8912651538848877,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 13.93359375,
"calib/ece": 0.18734,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.816,
"calib/gap": 0.2514656964656967,
"calib/mean_conf": 0.8747,
"calib/mu_c": 0.9400810810810812,
"calib/mu_w": 0.6886153846153845,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16102000000000002,
"calib/std_conf": 0.28317911293031484,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3708603583916084,
"calib/step_q_c_n": 2288.0,
"calib/step_q_gap": -0.056880923860150756,
"calib/step_q_w": 0.42774128225175917,
"calib/step_q_w_n": 1279.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2210.0,
"completions/max_terminated_length": 2210.0,
"completions/mean_length": 730.76171875,
"completions/mean_terminated_length": 751.30517578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 260.0,
"epoch": 0.18346666666666667,
"grad_norm": 1.075636863708496,
"kl": 0.22918701171875,
"learning_rate": 8.055555555555557e-07,
"loss": -0.044,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.021154478192329407,
"mask/share_reasoning": 0.7770688533782959,
"mask/share_step_conf": 0.1744329333305359,
"num_tokens": 52290160.0,
"reward": 0.7800790667533875,
"reward_std": 0.18976718187332153,
"rewards/accuracy_reward_step": 0.72265625,
"rewards/final_brier_reward_step": 0.7871382236480713,
"rewards/format_reward_step": 0.9765625,
"rewards/step_margin_reward": 0.43317610025405884,
"step": 172
},
{
"adv/mean_abs_final_conf": 0.45317739248275757,
"adv/mean_abs_reasoning": 0.3944540023803711,
"adv/mean_abs_step_conf": 0.5952010750770569,
"adv/ratio_final_to_reasoning": 1.1488725928701813,
"adv/ratio_step_to_reasoning": 1.5089239087073727,
"adv/std_final_conf": 0.7222513556480408,
"adv/std_reasoning": 0.6815750002861023,
"adv/std_step_conf": 0.8438770174980164,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 14.734375,
"calib/ece": 0.21865737051792833,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.796812749003984,
"calib/gap": 0.25239186228482,
"calib/mean_conf": 0.85698406374502,
"calib/mu_c": 0.9283777777777777,
"calib/mu_w": 0.6759859154929577,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17925498007968133,
"calib/std_conf": 0.3073001954190747,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.40082438334007275,
"calib/step_q_c_n": 2473.0,
"calib/step_q_gap": 0.026532543463244462,
"calib/step_q_w": 0.3742918398768283,
"calib/step_q_w_n": 1299.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2875.0,
"completions/max_terminated_length": 2875.0,
"completions/mean_length": 806.23828125,
"completions/mean_terminated_length": 822.298828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 211.0,
"epoch": 0.18453333333333333,
"grad_norm": 1.289366602897644,
"kl": 0.21563720703125,
"learning_rate": 7.777777777777779e-07,
"loss": -0.0774,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.02132277935743332,
"mask/share_reasoning": 0.7813730835914612,
"mask/share_step_conf": 0.1777728945016861,
"num_tokens": 52599717.0,
"reward": 0.6934113502502441,
"reward_std": 0.2096310555934906,
"rewards/accuracy_reward_step": 0.703125,
"rewards/final_brier_reward_step": 0.770208477973938,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.27989545464515686,
"step": 173
},
{
"adv/mean_abs_final_conf": 0.6756685972213745,
"adv/mean_abs_reasoning": 0.6307892799377441,
"adv/mean_abs_step_conf": 0.6734330058097839,
"adv/ratio_final_to_reasoning": 1.0711478757027382,
"adv/ratio_step_to_reasoning": 1.067603758066796,
"adv/std_final_conf": 0.8757950663566589,
"adv/std_reasoning": 0.8431174159049988,
"adv/std_step_conf": 0.876012921333313,
"calib/answer_extract_rate": 0.921875,
"calib/avg_num_step_conf": 16.109375,
"calib/ece": 0.2903601694915255,
"calib/final_conf_rate": 0.921875,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.7033898305084746,
"calib/gap": 0.11443405889884761,
"calib/mean_conf": 0.7849364406779661,
"calib/mu_c": 0.8193636363636363,
"calib/mu_w": 0.7049295774647887,
"calib/nonempty_final_conf_rate": 0.921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18807203389830512,
"calib/std_conf": 0.34952009499837083,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.36908685661764706,
"calib/step_q_c_n": 2176.0,
"calib/step_q_gap": -0.035138593253818184,
"calib/step_q_w": 0.40422544987146525,
"calib/step_q_w_n": 1945.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 3060.0,
"completions/max_terminated_length": 3060.0,
"completions/mean_length": 778.40625,
"completions/mean_terminated_length": 840.81005859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 355.0,
"epoch": 0.1856,
"grad_norm": 1.0049992799758911,
"kl": 0.201873779296875,
"learning_rate": 7.5e-07,
"loss": -0.2353,
"mask/has_final_conf_rate": 0.921875,
"mask/share_final_conf": 0.017570119351148605,
"mask/share_reasoning": 0.751032829284668,
"mask/share_step_conf": 0.15717829763889313,
"num_tokens": 52903221.0,
"reward": 0.6363409757614136,
"reward_std": 0.30102431774139404,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.652944028377533,
"rewards/format_reward_step": 0.91796875,
"rewards/step_margin_reward": 0.3072379529476166,
"step": 174
},
{
"adv/mean_abs_final_conf": 0.5732844471931458,
"adv/mean_abs_reasoning": 0.48877519369125366,
"adv/mean_abs_step_conf": 0.670279860496521,
"adv/ratio_final_to_reasoning": 1.1729000460593635,
"adv/ratio_step_to_reasoning": 1.3713459053323378,
"adv/std_final_conf": 0.7766127586364746,
"adv/std_reasoning": 0.7209515571594238,
"adv/std_step_conf": 0.8758025765419006,
"calib/answer_extract_rate": 0.91796875,
"calib/avg_num_step_conf": 18.12890625,
"calib/ece": 0.24031914893617012,
"calib/final_conf_rate": 0.91796875,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.548936170212766,
"calib/gap": 0.28600727272727267,
"calib/mean_conf": 0.6724042553191488,
"calib/mu_c": 0.80628,
"calib/mu_w": 0.5202727272727273,
"calib/nonempty_final_conf_rate": 0.91796875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19040425531914884,
"calib/std_conf": 0.39118405417042684,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3666872119815668,
"calib/step_q_c_n": 1736.0,
"calib/step_q_gap": -0.059375438620842824,
"calib/step_q_w": 0.4260626506024096,
"calib/step_q_w_n": 2905.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08203125,
"completions/max_length": 2928.0,
"completions/max_terminated_length": 2928.0,
"completions/mean_length": 802.3515625,
"completions/mean_terminated_length": 874.051025390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 205.0,
"epoch": 0.18666666666666668,
"grad_norm": 1.099468469619751,
"kl": 0.203277587890625,
"learning_rate": 7.222222222222222e-07,
"loss": -0.2011,
"mask/has_final_conf_rate": 0.91796875,
"mask/share_final_conf": 0.018801182508468628,
"mask/share_reasoning": 0.7300344705581665,
"mask/share_step_conf": 0.16913309693336487,
"num_tokens": 53214447.0,
"reward": 0.5531612634658813,
"reward_std": 0.21131201088428497,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.6615592241287231,
"rewards/format_reward_step": 0.91796875,
"rewards/step_margin_reward": 0.1635132133960724,
"step": 175
},
{
"adv/mean_abs_final_conf": 0.4588373899459839,
"adv/mean_abs_reasoning": 0.42444854974746704,
"adv/mean_abs_step_conf": 0.6860179305076599,
"adv/ratio_final_to_reasoning": 1.0810200440523994,
"adv/ratio_step_to_reasoning": 1.6162569784154477,
"adv/std_final_conf": 0.7221850752830505,
"adv/std_reasoning": 0.7014881372451782,
"adv/std_step_conf": 0.8756811022758484,
"calib/answer_extract_rate": 0.95703125,
"calib/avg_num_step_conf": 15.48046875,
"calib/ece": 0.1927714285714286,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.763265306122449,
"calib/gap": 0.33404130201136184,
"calib/mean_conf": 0.8189755102040817,
"calib/mu_c": 0.9253233532934132,
"calib/mu_w": 0.5912820512820514,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1650571428571429,
"calib/std_conf": 0.3378478334752546,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.37296883116883117,
"calib/step_q_c_n": 2310.0,
"calib/step_q_gap": -0.022045929871701175,
"calib/step_q_w": 0.39501476104053235,
"calib/step_q_w_n": 1653.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2584.0,
"completions/max_terminated_length": 2584.0,
"completions/mean_length": 778.71875,
"completions/mean_terminated_length": 813.6815795898438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 257.0,
"epoch": 0.18773333333333334,
"grad_norm": 1.1529240608215332,
"kl": 0.2037353515625,
"learning_rate": 6.944444444444446e-07,
"loss": -0.249,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.02091934159398079,
"mask/share_reasoning": 0.7594068646430969,
"mask/share_step_conf": 0.17670507729053497,
"num_tokens": 53517863.0,
"reward": 0.7074384689331055,
"reward_std": 0.21603649854660034,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.7608078122138977,
"rewards/format_reward_step": 0.95703125,
"rewards/step_margin_reward": 0.33219408988952637,
"step": 176
},
{
"adv/mean_abs_final_conf": 0.48085418343544006,
"adv/mean_abs_reasoning": 0.38578104972839355,
"adv/mean_abs_step_conf": 0.6780802607536316,
"adv/ratio_final_to_reasoning": 1.246443244876808,
"adv/ratio_step_to_reasoning": 1.757681620782123,
"adv/std_final_conf": 0.7400456070899963,
"adv/std_reasoning": 0.6816667318344116,
"adv/std_step_conf": 0.875238835811615,
"calib/answer_extract_rate": 0.9375,
"calib/avg_num_step_conf": 14.42578125,
"calib/ece": 0.18491701244813277,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.6970954356846473,
"calib/gap": 0.3424996165055991,
"calib/mean_conf": 0.7964522821576763,
"calib/mu_c": 0.912987421383648,
"calib/mu_w": 0.5704878048780488,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1608091286307054,
"calib/std_conf": 0.3339682924127909,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3727267054468535,
"calib/step_q_c_n": 1891.0,
"calib/step_q_gap": -0.009210586451037728,
"calib/step_q_w": 0.3819372918978912,
"calib/step_q_w_n": 1802.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 1639.0,
"completions/max_terminated_length": 1639.0,
"completions/mean_length": 702.4140625,
"completions/mean_terminated_length": 746.1328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 263.0,
"epoch": 0.1888,
"grad_norm": 1.0421947240829468,
"kl": 0.227264404296875,
"learning_rate": 6.666666666666667e-07,
"loss": -0.2115,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.02060539461672306,
"mask/share_reasoning": 0.7536983489990234,
"mask/share_step_conf": 0.16710247099399567,
"num_tokens": 53801513.0,
"reward": 0.7175785303115845,
"reward_std": 0.1975366175174713,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.7486916780471802,
"rewards/format_reward_step": 0.9375,
"rewards/step_margin_reward": 0.37474653124809265,
"step": 177
},
{
"adv/mean_abs_final_conf": 0.5379396080970764,
"adv/mean_abs_reasoning": 0.4493616223335266,
"adv/mean_abs_step_conf": 0.6565547585487366,
"adv/ratio_final_to_reasoning": 1.1971196055941893,
"adv/ratio_step_to_reasoning": 1.4610832922029697,
"adv/std_final_conf": 0.7765239477157593,
"adv/std_reasoning": 0.7393073439598083,
"adv/std_step_conf": 0.860089123249054,
"calib/answer_extract_rate": 0.96484375,
"calib/avg_num_step_conf": 14.96484375,
"calib/ece": 0.1662753036437247,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.7246963562753036,
"calib/gap": 0.3089900768245838,
"calib/mean_conf": 0.81834008097166,
"calib/mu_c": 0.9071590909090909,
"calib/mu_w": 0.5981690140845071,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1360323886639676,
"calib/std_conf": 0.3129141429580187,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3645833405358686,
"calib/step_q_c_n": 2314.0,
"calib/step_q_gap": -0.025199058936774765,
"calib/step_q_w": 0.38978239947264337,
"calib/step_q_w_n": 1517.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2141.0,
"completions/max_terminated_length": 2141.0,
"completions/mean_length": 742.91015625,
"completions/mean_terminated_length": 769.9797973632812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 235.0,
"epoch": 0.18986666666666666,
"grad_norm": 1.3304215669631958,
"kl": 0.22064208984375,
"learning_rate": 6.388888888888889e-07,
"loss": -0.156,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.02037697285413742,
"mask/share_reasoning": 0.7630996704101562,
"mask/share_step_conf": 0.18136712908744812,
"num_tokens": 54097770.0,
"reward": 0.696796178817749,
"reward_std": 0.23738355934619904,
"rewards/accuracy_reward_step": 0.6875,
"rewards/final_brier_reward_step": 0.7840775847434998,
"rewards/format_reward_step": 0.96484375,
"rewards/step_margin_reward": 0.2790459394454956,
"step": 178
},
{
"adv/mean_abs_final_conf": 0.5894767045974731,
"adv/mean_abs_reasoning": 0.4320108890533447,
"adv/mean_abs_step_conf": 0.6031880974769592,
"adv/ratio_final_to_reasoning": 1.364495015135335,
"adv/ratio_step_to_reasoning": 1.3962335504984864,
"adv/std_final_conf": 0.8110029101371765,
"adv/std_reasoning": 0.7014033794403076,
"adv/std_step_conf": 0.8274986147880554,
"calib/answer_extract_rate": 0.95703125,
"calib/avg_num_step_conf": 14.6015625,
"calib/ece": 0.15346530612244896,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.6653061224489796,
"calib/gap": 0.39412153110047843,
"calib/mean_conf": 0.7682816326530612,
"calib/mu_c": 0.856757894736842,
"calib/mu_w": 0.4626363636363636,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.07311836734693877,
"calib/std_conf": 0.34810459117531684,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.37205936091765673,
"calib/step_q_c_n": 2441.0,
"calib/step_q_gap": 0.02078179730932983,
"calib/step_q_w": 0.3512775636083269,
"calib/step_q_w_n": 1297.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2793.0,
"completions/max_terminated_length": 2793.0,
"completions/mean_length": 774.578125,
"completions/mean_terminated_length": 809.3550415039062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 312.0,
"epoch": 0.19093333333333334,
"grad_norm": 1.688877820968628,
"kl": 0.205169677734375,
"learning_rate": 6.111111111111112e-07,
"loss": -0.0878,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.019409772008657455,
"mask/share_reasoning": 0.7689638137817383,
"mask/share_step_conf": 0.16865769028663635,
"num_tokens": 54402326.0,
"reward": 0.6934785842895508,
"reward_std": 0.20034080743789673,
"rewards/accuracy_reward_step": 0.7421875,
"rewards/final_brier_reward_step": 0.8057296872138977,
"rewards/format_reward_step": 0.95703125,
"rewards/step_margin_reward": 0.2413836568593979,
"step": 179
},
{
"adv/mean_abs_final_conf": 0.4851997494697571,
"adv/mean_abs_reasoning": 0.3911029100418091,
"adv/mean_abs_step_conf": 0.5557488203048706,
"adv/ratio_final_to_reasoning": 1.2405935548213871,
"adv/ratio_step_to_reasoning": 1.4209784842701907,
"adv/std_final_conf": 0.75730961561203,
"adv/std_reasoning": 0.6818585395812988,
"adv/std_step_conf": 0.8105658888816833,
"calib/answer_extract_rate": 0.921875,
"calib/avg_num_step_conf": 17.41015625,
"calib/ece": 0.17533898305084752,
"calib/final_conf_rate": 0.921875,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.7076271186440678,
"calib/gap": 0.32153015985162936,
"calib/mean_conf": 0.7915169491525423,
"calib/mu_c": 0.8827988165680474,
"calib/mu_w": 0.561268656716418,
"calib/nonempty_final_conf_rate": 0.921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1253771186440679,
"calib/std_conf": 0.3434018433030211,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3868223106060606,
"calib/step_q_c_n": 2640.0,
"calib/step_q_gap": -0.04154697943246444,
"calib/step_q_w": 0.42836929003852503,
"calib/step_q_w_n": 1817.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 3015.0,
"completions/max_terminated_length": 3015.0,
"completions/mean_length": 861.81640625,
"completions/mean_terminated_length": 926.995849609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 323.0,
"epoch": 0.192,
"grad_norm": 1.3137576580047607,
"kl": 0.18927001953125,
"learning_rate": 5.833333333333334e-07,
"loss": -0.232,
"mask/has_final_conf_rate": 0.921875,
"mask/share_final_conf": 0.01645885594189167,
"mask/share_reasoning": 0.7454394102096558,
"mask/share_step_conf": 0.1677892506122589,
"num_tokens": 54726807.0,
"reward": 0.6496822237968445,
"reward_std": 0.1653367280960083,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.7410233020782471,
"rewards/format_reward_step": 0.921875,
"rewards/step_margin_reward": 0.2419348955154419,
"step": 180
},
{
"adv/mean_abs_final_conf": 0.5677691698074341,
"adv/mean_abs_reasoning": 0.5137337446212769,
"adv/mean_abs_step_conf": 0.6264705657958984,
"adv/ratio_final_to_reasoning": 1.1051817712032757,
"adv/ratio_step_to_reasoning": 1.2194460113920118,
"adv/std_final_conf": 0.8043436408042908,
"adv/std_reasoning": 0.7756029963493347,
"adv/std_step_conf": 0.8440066576004028,
"calib/answer_extract_rate": 0.9140625,
"calib/avg_num_step_conf": 17.52734375,
"calib/ece": 0.17845726495726488,
"calib/final_conf_rate": 0.9140625,
"calib/format_rate": 0.9140625,
"calib/frac_conf_gt_0.9": 0.6965811965811965,
"calib/gap": 0.3656142913541255,
"calib/mean_conf": 0.7876880341880341,
"calib/mu_c": 0.920496644295302,
"calib/mu_w": 0.5548823529411765,
"calib/nonempty_final_conf_rate": 0.9140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16469658119658115,
"calib/std_conf": 0.3390760461467229,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3883713749413421,
"calib/step_q_c_n": 2131.0,
"calib/step_q_gap": -0.09216385426069529,
"calib/step_q_w": 0.4805352292020374,
"calib/step_q_w_n": 2356.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08203125,
"completions/max_length": 2851.0,
"completions/max_terminated_length": 2851.0,
"completions/mean_length": 730.82421875,
"completions/mean_terminated_length": 796.1318969726562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 305.0,
"epoch": 0.19306666666666666,
"grad_norm": 1.38997483253479,
"kl": 0.220367431640625,
"learning_rate": 5.555555555555555e-07,
"loss": -0.2918,
"mask/has_final_conf_rate": 0.9140625,
"mask/share_final_conf": 0.01842355541884899,
"mask/share_reasoning": 0.731447696685791,
"mask/share_step_conf": 0.16809748113155365,
"num_tokens": 55020162.0,
"reward": 0.6222001314163208,
"reward_std": 0.24766887724399567,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7313224673271179,
"rewards/format_reward_step": 0.9140625,
"rewards/step_margin_reward": 0.21385908126831055,
"step": 181
},
{
"adv/mean_abs_final_conf": 0.40869730710983276,
"adv/mean_abs_reasoning": 0.3939915895462036,
"adv/mean_abs_step_conf": 0.6618375778198242,
"adv/ratio_final_to_reasoning": 1.0373249530036088,
"adv/ratio_step_to_reasoning": 1.6798266647826758,
"adv/std_final_conf": 0.6826504468917847,
"adv/std_reasoning": 0.6816443800926208,
"adv/std_step_conf": 0.8758221864700317,
"calib/answer_extract_rate": 0.9609375,
"calib/avg_num_step_conf": 15.66015625,
"calib/ece": 0.17861788617886182,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.7642276422764228,
"calib/gap": 0.33253009088676,
"calib/mean_conf": 0.827520325203252,
"calib/mu_c": 0.9207909604519774,
"calib/mu_w": 0.5882608695652174,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14331300813008133,
"calib/std_conf": 0.324837904532449,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.34900135964912277,
"calib/step_q_c_n": 2280.0,
"calib/step_q_gap": -0.04738846105648736,
"calib/step_q_w": 0.39638982070561013,
"calib/step_q_w_n": 1729.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2676.0,
"completions/max_terminated_length": 2676.0,
"completions/mean_length": 775.29296875,
"completions/mean_terminated_length": 806.8088989257812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 325.0,
"epoch": 0.19413333333333332,
"grad_norm": 0.639399528503418,
"kl": 0.210113525390625,
"learning_rate": 5.277777777777779e-07,
"loss": -0.0989,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.019323285669088364,
"mask/share_reasoning": 0.7690304517745972,
"mask/share_step_conf": 0.17258381843566895,
"num_tokens": 55324797.0,
"reward": 0.7346382141113281,
"reward_std": 0.20908474922180176,
"rewards/accuracy_reward_step": 0.69140625,
"rewards/final_brier_reward_step": 0.7833744287490845,
"rewards/format_reward_step": 0.9609375,
"rewards/step_margin_reward": 0.35543322563171387,
"step": 182
},
{
"adv/mean_abs_final_conf": 0.599501371383667,
"adv/mean_abs_reasoning": 0.5415783524513245,
"adv/mean_abs_step_conf": 0.7447316646575928,
"adv/ratio_final_to_reasoning": 1.1069522418504505,
"adv/ratio_step_to_reasoning": 1.375113427792569,
"adv/std_final_conf": 0.8279132843017578,
"adv/std_reasoning": 0.7931107878684998,
"adv/std_step_conf": 0.9066308736801147,
"calib/answer_extract_rate": 0.92578125,
"calib/avg_num_step_conf": 17.9296875,
"calib/ece": 0.1794050632911392,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.6075949367088608,
"calib/gap": 0.22172419895678075,
"calib/mean_conf": 0.7547215189873417,
"calib/mu_c": 0.8117897727272727,
"calib/mu_w": 0.590065573770492,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.09575527426160335,
"calib/std_conf": 0.3328867321729186,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.353248,
"calib/step_q_c_n": 2450.0,
"calib/step_q_gap": -0.06968190654205603,
"calib/step_q_w": 0.42292990654205603,
"calib/step_q_w_n": 2140.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 2216.0,
"completions/max_terminated_length": 2216.0,
"completions/mean_length": 766.8046875,
"completions/mean_terminated_length": 828.2784423828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 271.0,
"epoch": 0.1952,
"grad_norm": 1.161199688911438,
"kl": 0.207366943359375,
"learning_rate": 5.000000000000001e-07,
"loss": -0.3313,
"mask/has_final_conf_rate": 0.92578125,
"mask/share_final_conf": 0.01840786077082157,
"mask/share_reasoning": 0.7362696528434753,
"mask/share_step_conf": 0.17110374569892883,
"num_tokens": 55627779.0,
"reward": 0.6906335353851318,
"reward_std": 0.2569107413291931,
"rewards/accuracy_reward_step": 0.6875,
"rewards/final_brier_reward_step": 0.7245738506317139,
"rewards/format_reward_step": 0.92578125,
"rewards/step_margin_reward": 0.3340369760990143,
"step": 183
},
{
"adv/mean_abs_final_conf": 0.5056432485580444,
"adv/mean_abs_reasoning": 0.4590601623058319,
"adv/mean_abs_step_conf": 0.6574715375900269,
"adv/ratio_final_to_reasoning": 1.1014749047667924,
"adv/ratio_step_to_reasoning": 1.4322121403163943,
"adv/std_final_conf": 0.7579997181892395,
"adv/std_reasoning": 0.7394355535507202,
"adv/std_step_conf": 0.8758077621459961,
"calib/answer_extract_rate": 0.9296875,
"calib/avg_num_step_conf": 16.91796875,
"calib/ece": 0.12669957983193275,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.7605042016806722,
"calib/gap": 0.38538515325670486,
"calib/mean_conf": 0.832329831932773,
"calib/mu_c": 0.9262472222222221,
"calib/mu_w": 0.5408620689655173,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10136344537815126,
"calib/std_conf": 0.311800230689387,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.38726167615947926,
"calib/step_q_c_n": 2458.0,
"calib/step_q_gap": -0.03298098267661259,
"calib/step_q_w": 0.42024265883609185,
"calib/step_q_w_n": 1873.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 2845.0,
"completions/max_terminated_length": 2845.0,
"completions/mean_length": 727.18359375,
"completions/mean_terminated_length": 782.1807250976562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 294.0,
"epoch": 0.19626666666666667,
"grad_norm": 0.7605681419372559,
"kl": 0.2161865234375,
"learning_rate": 4.7222222222222226e-07,
"loss": -0.2007,
"mask/has_final_conf_rate": 0.9296875,
"mask/share_final_conf": 0.019348185509443283,
"mask/share_reasoning": 0.7410755753517151,
"mask/share_step_conf": 0.16926376521587372,
"num_tokens": 55919218.0,
"reward": 0.7029640674591064,
"reward_std": 0.22882726788520813,
"rewards/accuracy_reward_step": 0.703125,
"rewards/final_brier_reward_step": 0.7946516275405884,
"rewards/format_reward_step": 0.9296875,
"rewards/step_margin_reward": 0.28471386432647705,
"step": 184
},
{
"adv/mean_abs_final_conf": 0.543674647808075,
"adv/mean_abs_reasoning": 0.4470303952693939,
"adv/mean_abs_step_conf": 0.6151069402694702,
"adv/ratio_final_to_reasoning": 1.216191680837363,
"adv/ratio_step_to_reasoning": 1.375984601447041,
"adv/std_final_conf": 0.779508650302887,
"adv/std_reasoning": 0.7206876873970032,
"adv/std_step_conf": 0.844012439250946,
"calib/answer_extract_rate": 0.94921875,
"calib/avg_num_step_conf": 15.28515625,
"calib/ece": 0.1368724279835391,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.691358024691358,
"calib/gap": 0.4064119047619048,
"calib/mean_conf": 0.7872427983539095,
"calib/mu_c": 0.9126785714285715,
"calib/mu_w": 0.5062666666666666,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.11637860082304528,
"calib/std_conf": 0.3335376082403418,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.34429794159885113,
"calib/step_q_c_n": 2089.0,
"calib/step_q_gap": -0.043456828137990955,
"calib/step_q_w": 0.3877547697368421,
"calib/step_q_w_n": 1824.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2098.0,
"completions/max_terminated_length": 2098.0,
"completions/mean_length": 749.9453125,
"completions/mean_terminated_length": 790.0657958984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 365.0,
"epoch": 0.19733333333333333,
"grad_norm": 1.293918490409851,
"kl": 0.20623779296875,
"learning_rate": 4.444444444444445e-07,
"loss": -0.2047,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.01943805254995823,
"mask/share_reasoning": 0.7589957118034363,
"mask/share_step_conf": 0.17078498005867004,
"num_tokens": 56218124.0,
"reward": 0.6847925186157227,
"reward_std": 0.23087607324123383,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.7969818115234375,
"rewards/format_reward_step": 0.94921875,
"rewards/step_margin_reward": 0.25150930881500244,
"step": 185
},
{
"adv/mean_abs_final_conf": 0.4829450845718384,
"adv/mean_abs_reasoning": 0.45892712473869324,
"adv/mean_abs_step_conf": 0.5779283046722412,
"adv/ratio_final_to_reasoning": 1.0523350190878795,
"adv/ratio_step_to_reasoning": 1.2593029993624927,
"adv/std_final_conf": 0.758141815662384,
"adv/std_reasoning": 0.7208572626113892,
"adv/std_step_conf": 0.8272896409034729,
"calib/answer_extract_rate": 0.8515625,
"calib/avg_num_step_conf": 20.10546875,
"calib/ece": 0.05591743119266059,
"calib/final_conf_rate": 0.8515625,
"calib/format_rate": 0.8515625,
"calib/frac_conf_gt_0.9": 0.7385321100917431,
"calib/gap": 0.5937821519140198,
"calib/mean_conf": 0.8089908256880735,
"calib/mu_c": 0.942455621301775,
"calib/mu_w": 0.3486734693877551,
"calib/nonempty_final_conf_rate": 0.8515625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.044839449541284444,
"calib/std_conf": 0.3308272781818732,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.37102044943820217,
"calib/step_q_c_n": 2225.0,
"calib/step_q_gap": -0.0818712343400319,
"calib/step_q_w": 0.4528916837782341,
"calib/step_q_w_n": 2922.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.14453125,
"completions/max_length": 2049.0,
"completions/max_terminated_length": 2049.0,
"completions/mean_length": 675.3046875,
"completions/mean_terminated_length": 789.397216796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 235.0,
"epoch": 0.1984,
"grad_norm": 1.1111749410629272,
"kl": 0.2142333984375,
"learning_rate": 4.1666666666666667e-07,
"loss": -0.4156,
"mask/has_final_conf_rate": 0.8515625,
"mask/share_final_conf": 0.017488744109869003,
"mask/share_reasoning": 0.6818581819534302,
"mask/share_step_conf": 0.15612183511257172,
"num_tokens": 56496042.0,
"reward": 0.6668155193328857,
"reward_std": 0.21297681331634521,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.7852226495742798,
"rewards/format_reward_step": 0.8515625,
"rewards/step_margin_reward": 0.24606461822986603,
"step": 186
},
{
"adv/mean_abs_final_conf": 0.6033467054367065,
"adv/mean_abs_reasoning": 0.5293259620666504,
"adv/mean_abs_step_conf": 0.7025371193885803,
"adv/ratio_final_to_reasoning": 1.1398396237378128,
"adv/ratio_step_to_reasoning": 1.3272296651493545,
"adv/std_final_conf": 0.8274721503257751,
"adv/std_reasoning": 0.7756878137588501,
"adv/std_step_conf": 0.906362771987915,
"calib/answer_extract_rate": 0.8984375,
"calib/avg_num_step_conf": 19.078125,
"calib/ece": 0.16744347826086953,
"calib/final_conf_rate": 0.8984375,
"calib/format_rate": 0.89453125,
"calib/frac_conf_gt_0.9": 0.5826086956521739,
"calib/gap": 0.31883709806005844,
"calib/mean_conf": 0.7278782608695651,
"calib/mu_c": 0.8263018867924529,
"calib/mu_w": 0.5074647887323944,
"calib/nonempty_final_conf_rate": 0.8984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10200869565217387,
"calib/std_conf": 0.3486890267179298,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.36826494481236205,
"calib/step_q_c_n": 2265.0,
"calib/step_q_gap": -0.021178501882326506,
"calib/step_q_w": 0.38944344669468856,
"calib/step_q_w_n": 2617.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09765625,
"completions/max_length": 2964.0,
"completions/max_terminated_length": 2964.0,
"completions/mean_length": 805.78515625,
"completions/mean_terminated_length": 892.9913330078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 268.0,
"epoch": 0.19946666666666665,
"grad_norm": 1.214120626449585,
"kl": 0.195098876953125,
"learning_rate": 3.8888888888888895e-07,
"loss": -0.3161,
"mask/has_final_conf_rate": 0.8984375,
"mask/share_final_conf": 0.01709054782986641,
"mask/share_reasoning": 0.7231029272079468,
"mask/share_step_conf": 0.16215023398399353,
"num_tokens": 56803867.0,
"reward": 0.6590248346328735,
"reward_std": 0.2382904291152954,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.7168227434158325,
"rewards/format_reward_step": 0.89453125,
"rewards/step_margin_reward": 0.29810184240341187,
"step": 187
},
{
"adv/mean_abs_final_conf": 0.4895247220993042,
"adv/mean_abs_reasoning": 0.4144752323627472,
"adv/mean_abs_step_conf": 0.7287554740905762,
"adv/ratio_final_to_reasoning": 1.1810711084199936,
"adv/ratio_step_to_reasoning": 1.758260608085677,
"adv/std_final_conf": 0.7579102516174316,
"adv/std_reasoning": 0.7014322876930237,
"adv/std_step_conf": 0.9215178489685059,
"calib/answer_extract_rate": 0.93359375,
"calib/avg_num_step_conf": 17.296875,
"calib/ece": 0.10073221757322179,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.7322175732217573,
"calib/gap": 0.48930679702048424,
"calib/mean_conf": 0.8173849372384938,
"calib/mu_c": 0.9402234636871509,
"calib/mu_w": 0.4509166666666667,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08458158995815904,
"calib/std_conf": 0.32389517769325593,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.35254701244813275,
"calib/step_q_c_n": 2410.0,
"calib/step_q_gap": -0.008747832396712063,
"calib/step_q_w": 0.3612948448448448,
"calib/step_q_w_n": 1998.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 2585.0,
"completions/max_terminated_length": 2585.0,
"completions/mean_length": 805.2421875,
"completions/mean_terminated_length": 855.3610229492188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 300.0,
"epoch": 0.20053333333333334,
"grad_norm": 1.4672449827194214,
"kl": 0.20538330078125,
"learning_rate": 3.611111111111111e-07,
"loss": -0.1776,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.018879268318414688,
"mask/share_reasoning": 0.7498528361320496,
"mask/share_step_conf": 0.17267413437366486,
"num_tokens": 57114081.0,
"reward": 0.7526510953903198,
"reward_std": 0.21485695242881775,
"rewards/accuracy_reward_step": 0.69921875,
"rewards/final_brier_reward_step": 0.823620080947876,
"rewards/format_reward_step": 0.9296875,
"rewards/step_margin_reward": 0.35590076446533203,
"step": 188
},
{
"adv/mean_abs_final_conf": 0.5508660078048706,
"adv/mean_abs_reasoning": 0.41227346658706665,
"adv/mean_abs_step_conf": 0.5993289947509766,
"adv/ratio_final_to_reasoning": 1.3361665313198978,
"adv/ratio_step_to_reasoning": 1.4537171157591493,
"adv/std_final_conf": 0.7943575382232666,
"adv/std_reasoning": 0.681851327419281,
"adv/std_step_conf": 0.8275741338729858,
"calib/answer_extract_rate": 0.9375,
"calib/avg_num_step_conf": 15.71875,
"calib/ece": 0.17445833333333333,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.6166666666666667,
"calib/gap": 0.3424080104200586,
"calib/mean_conf": 0.7287916666666666,
"calib/mu_c": 0.834367469879518,
"calib/mu_w": 0.49195945945945946,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.10579166666666666,
"calib/std_conf": 0.3626002710936965,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.37252279635258356,
"calib/step_q_c_n": 1974.0,
"calib/step_q_gap": -0.027345496330343277,
"calib/step_q_w": 0.39986829268292684,
"calib/step_q_w_n": 2050.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2276.0,
"completions/max_terminated_length": 2276.0,
"completions/mean_length": 697.14453125,
"completions/mean_terminated_length": 743.620849609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 268.0,
"epoch": 0.2016,
"grad_norm": 1.2404696941375732,
"kl": 0.227996826171875,
"learning_rate": 3.3333333333333335e-07,
"loss": -0.1732,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.02052876353263855,
"mask/share_reasoning": 0.7510529160499573,
"mask/share_step_conf": 0.16591832041740417,
"num_tokens": 57400318.0,
"reward": 0.6404236555099487,
"reward_std": 0.18784648180007935,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.7499300837516785,
"rewards/format_reward_step": 0.9375,
"rewards/step_margin_reward": 0.21372976899147034,
"step": 189
},
{
"adv/mean_abs_final_conf": 0.5000724792480469,
"adv/mean_abs_reasoning": 0.46006476879119873,
"adv/mean_abs_step_conf": 0.6737987995147705,
"adv/ratio_final_to_reasoning": 1.0869610393379323,
"adv/ratio_step_to_reasoning": 1.4645737844372417,
"adv/std_final_conf": 0.7748420238494873,
"adv/std_reasoning": 0.7394383549690247,
"adv/std_step_conf": 0.8910828232765198,
"calib/answer_extract_rate": 0.9140625,
"calib/avg_num_step_conf": 18.03515625,
"calib/ece": 0.13979059829059842,
"calib/final_conf_rate": 0.9140625,
"calib/format_rate": 0.91015625,
"calib/frac_conf_gt_0.9": 0.7478632478632479,
"calib/gap": 0.42944550327575937,
"calib/mean_conf": 0.8211581196581197,
"calib/mu_c": 0.9551304347826087,
"calib/mu_w": 0.5256849315068494,
"calib/nonempty_final_conf_rate": 0.9140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1364572649572651,
"calib/std_conf": 0.32836119333558894,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.3299528905560459,
"calib/step_q_c_n": 2266.0,
"calib/step_q_gap": -0.0427577006817253,
"calib/step_q_w": 0.37271059123777117,
"calib/step_q_w_n": 2351.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2932.0,
"completions/max_terminated_length": 2932.0,
"completions/mean_length": 811.3125,
"completions/mean_terminated_length": 880.0678100585938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 279.0,
"epoch": 0.20266666666666666,
"grad_norm": 1.3556324243545532,
"kl": 0.188079833984375,
"learning_rate": 3.055555555555556e-07,
"loss": -0.2387,
"mask/has_final_conf_rate": 0.9140625,
"mask/share_final_conf": 0.017239660024642944,
"mask/share_reasoning": 0.7378236055374146,
"mask/share_step_conf": 0.1668117344379425,
"num_tokens": 57713622.0,
"reward": 0.6745651960372925,
"reward_std": 0.23061785101890564,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.7677172422409058,
"rewards/format_reward_step": 0.91015625,
"rewards/step_margin_reward": 0.27360066771507263,
"step": 190
},
{
"adv/mean_abs_final_conf": 0.4767415523529053,
"adv/mean_abs_reasoning": 0.4095577001571655,
"adv/mean_abs_step_conf": 0.5481799244880676,
"adv/ratio_final_to_reasoning": 1.1640400172428897,
"adv/ratio_step_to_reasoning": 1.3384681188455414,
"adv/std_final_conf": 0.7403095960617065,
"adv/std_reasoning": 0.681888997554779,
"adv/std_step_conf": 0.7935493588447571,
"calib/answer_extract_rate": 0.8984375,
"calib/avg_num_step_conf": 18.0859375,
"calib/ece": 0.18001739130434788,
"calib/final_conf_rate": 0.8984375,
"calib/format_rate": 0.8984375,
"calib/frac_conf_gt_0.9": 0.6869565217391305,
"calib/gap": 0.3283962217250809,
"calib/mean_conf": 0.7876695652173913,
"calib/mu_c": 0.9033221476510067,
"calib/mu_w": 0.5749259259259258,
"calib/nonempty_final_conf_rate": 0.8984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15993043478260874,
"calib/std_conf": 0.33018642222470335,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3850604444444444,
"calib/step_q_c_n": 2025.0,
"calib/step_q_gap": -0.061241666879931766,
"calib/step_q_w": 0.4463021113243762,
"calib/step_q_w_n": 2605.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2985.0,
"completions/max_terminated_length": 2985.0,
"completions/mean_length": 720.7890625,
"completions/mean_terminated_length": 795.3534545898438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 280.0,
"epoch": 0.20373333333333332,
"grad_norm": 1.053230881690979,
"kl": 0.2259521484375,
"learning_rate": 2.7777777777777776e-07,
"loss": -0.312,
"mask/has_final_conf_rate": 0.8984375,
"mask/share_final_conf": 0.019232170656323433,
"mask/share_reasoning": 0.7079811096191406,
"mask/share_step_conf": 0.1790367066860199,
"num_tokens": 58002312.0,
"reward": 0.6380224227905273,
"reward_std": 0.1949125975370407,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7125676870346069,
"rewards/format_reward_step": 0.8984375,
"rewards/step_margin_reward": 0.2673834264278412,
"step": 191
},
{
"adv/mean_abs_final_conf": 0.4584586024284363,
"adv/mean_abs_reasoning": 0.3656942844390869,
"adv/mean_abs_step_conf": 0.6547200679779053,
"adv/ratio_final_to_reasoning": 1.2536663052626982,
"adv/ratio_step_to_reasoning": 1.7903481017816152,
"adv/std_final_conf": 0.7212614417076111,
"adv/std_reasoning": 0.6406073570251465,
"adv/std_step_conf": 0.843872606754303,
"calib/answer_extract_rate": 0.90625,
"calib/avg_num_step_conf": 16.19140625,
"calib/ece": 0.07082521551724141,
"calib/final_conf_rate": 0.90625,
"calib/format_rate": 0.90625,
"calib/frac_conf_gt_0.9": 0.6508620689655172,
"calib/gap": 0.5387381102756892,
"calib/mean_conf": 0.7644459051724138,
"calib/mu_c": 0.8968082857142857,
"calib/mu_w": 0.3580701754385965,
"calib/nonempty_final_conf_rate": 0.90625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.04048038793103452,
"calib/std_conf": 0.34546469530939794,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3580890366972477,
"calib/step_q_c_n": 2180.0,
"calib/step_q_gap": -0.06594709561827389,
"calib/step_q_w": 0.42403613231552156,
"calib/step_q_w_n": 1965.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0859375,
"completions/max_length": 3025.0,
"completions/max_terminated_length": 3025.0,
"completions/mean_length": 715.5,
"completions/mean_terminated_length": 782.769287109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 218.0,
"epoch": 0.2048,
"grad_norm": 1.5645737648010254,
"kl": 0.224853515625,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.3583,
"mask/has_final_conf_rate": 0.90625,
"mask/share_final_conf": 0.020545516163110733,
"mask/share_reasoning": 0.7284271121025085,
"mask/share_step_conf": 0.16508983075618744,
"num_tokens": 58290456.0,
"reward": 0.7366434335708618,
"reward_std": 0.20213548839092255,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.8110120296478271,
"rewards/format_reward_step": 0.90625,
"rewards/step_margin_reward": 0.34430617094039917,
"step": 192
},
{
"adv/mean_abs_final_conf": 0.5915266275405884,
"adv/mean_abs_reasoning": 0.5218731164932251,
"adv/mean_abs_step_conf": 0.6205939054489136,
"adv/ratio_final_to_reasoning": 1.1334682873021062,
"adv/ratio_step_to_reasoning": 1.1891662663504339,
"adv/std_final_conf": 0.8266447186470032,
"adv/std_reasoning": 0.7755892276763916,
"adv/std_step_conf": 0.8439964056015015,
"calib/answer_extract_rate": 0.90625,
"calib/avg_num_step_conf": 17.828125,
"calib/ece": 0.21960729613733918,
"calib/final_conf_rate": 0.91015625,
"calib/format_rate": 0.90625,
"calib/frac_conf_gt_0.9": 0.6437768240343348,
"calib/gap": 0.2658452380952381,
"calib/mean_conf": 0.7656587982832619,
"calib/mu_c": 0.8614999999999999,
"calib/mu_w": 0.5956547619047619,
"calib/nonempty_final_conf_rate": 0.91015625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17289055793991426,
"calib/std_conf": 0.3415928990606869,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3481242957746478,
"calib/step_q_c_n": 1988.0,
"calib/step_q_gap": -0.06758511157524866,
"calib/step_q_w": 0.4157094073498965,
"calib/step_q_w_n": 2576.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08203125,
"completions/max_length": 2965.0,
"completions/max_terminated_length": 2965.0,
"completions/mean_length": 748.2890625,
"completions/mean_terminated_length": 815.1574096679688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 378.0,
"epoch": 0.20586666666666667,
"grad_norm": 1.8033738136291504,
"kl": 0.205810546875,
"learning_rate": 2.2222222222222224e-07,
"loss": -0.2309,
"mask/has_final_conf_rate": 0.91015625,
"mask/share_final_conf": 0.018781986087560654,
"mask/share_reasoning": 0.7305821180343628,
"mask/share_step_conf": 0.16860461235046387,
"num_tokens": 58587730.0,
"reward": 0.6168700456619263,
"reward_std": 0.24369192123413086,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.6872923374176025,
"rewards/format_reward_step": 0.90625,
"rewards/step_margin_reward": 0.24879151582717896,
"step": 193
},
{
"adv/mean_abs_final_conf": 0.46990489959716797,
"adv/mean_abs_reasoning": 0.3474540710449219,
"adv/mean_abs_step_conf": 0.5742833614349365,
"adv/ratio_final_to_reasoning": 1.3524230646772724,
"adv/ratio_step_to_reasoning": 1.6528324440345619,
"adv/std_final_conf": 0.7213148474693298,
"adv/std_reasoning": 0.6403520107269287,
"adv/std_step_conf": 0.8269177675247192,
"calib/answer_extract_rate": 0.9375,
"calib/avg_num_step_conf": 17.01953125,
"calib/ece": 0.12043750000000003,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.65,
"calib/gap": 0.46018681318681304,
"calib/mean_conf": 0.7449375000000001,
"calib/mu_c": 0.8695714285714284,
"calib/mu_w": 0.4093846153846154,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.0681041666666667,
"calib/std_conf": 0.3680022648396112,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.372512690134724,
"calib/step_q_c_n": 2301.0,
"calib/step_q_gap": -0.026359391577338243,
"calib/step_q_w": 0.39887208171206223,
"calib/step_q_w_n": 2056.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2569.0,
"completions/max_terminated_length": 2569.0,
"completions/mean_length": 732.6328125,
"completions/mean_terminated_length": 781.4750366210938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 359.0,
"epoch": 0.20693333333333333,
"grad_norm": 1.3977628946304321,
"kl": 0.2171630859375,
"learning_rate": 1.9444444444444447e-07,
"loss": -0.1234,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.019067011773586273,
"mask/share_reasoning": 0.7472081184387207,
"mask/share_step_conf": 0.17122486233711243,
"num_tokens": 58881228.0,
"reward": 0.6840992569923401,
"reward_std": 0.15268655121326447,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.795563280582428,
"rewards/format_reward_step": 0.9375,
"rewards/step_margin_reward": 0.2484164834022522,
"step": 194
},
{
"adv/mean_abs_final_conf": 0.5955495238304138,
"adv/mean_abs_reasoning": 0.48984187841415405,
"adv/mean_abs_step_conf": 0.5682377815246582,
"adv/ratio_final_to_reasoning": 1.2157995264890062,
"adv/ratio_step_to_reasoning": 1.1600432845070499,
"adv/std_final_conf": 0.8277072310447693,
"adv/std_reasoning": 0.7394471764564514,
"adv/std_step_conf": 0.8262815475463867,
"calib/answer_extract_rate": 0.94140625,
"calib/avg_num_step_conf": 17.23828125,
"calib/ece": 0.1139792531120332,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.5933609958506224,
"calib/gap": 0.44922449762233474,
"calib/mean_conf": 0.7133153526970955,
"calib/mu_c": 0.8661635220125786,
"calib/mu_w": 0.4169390243902439,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.08377178423236517,
"calib/std_conf": 0.36904471292478735,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.38244337016574587,
"calib/step_q_c_n": 2172.0,
"calib/step_q_gap": 0.005894284935937744,
"calib/step_q_w": 0.3765490852298081,
"calib/step_q_w_n": 2241.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 2583.0,
"completions/max_terminated_length": 2583.0,
"completions/mean_length": 781.796875,
"completions/mean_terminated_length": 830.4564819335938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 285.0,
"epoch": 0.208,
"grad_norm": 1.2796424627304077,
"kl": 0.21484375,
"learning_rate": 1.6666666666666668e-07,
"loss": -0.163,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.018959684297442436,
"mask/share_reasoning": 0.7448201179504395,
"mask/share_step_conf": 0.17762643098831177,
"num_tokens": 59187352.0,
"reward": 0.6303240060806274,
"reward_std": 0.20820724964141846,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.789030909538269,
"rewards/format_reward_step": 0.94140625,
"rewards/step_margin_reward": 0.1591169536113739,
"step": 195
},
{
"adv/mean_abs_final_conf": 0.37481117248535156,
"adv/mean_abs_reasoning": 0.27760109305381775,
"adv/mean_abs_step_conf": 0.682806670665741,
"adv/ratio_final_to_reasoning": 1.350179022575707,
"adv/ratio_step_to_reasoning": 2.4596685234714375,
"adv/std_final_conf": 0.644271969795227,
"adv/std_reasoning": 0.5726718902587891,
"adv/std_step_conf": 0.8913529515266418,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 13.58203125,
"calib/ece": 0.1978167330677291,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.7729083665338645,
"calib/gap": 0.35664941545480466,
"calib/mean_conf": 0.8557450199203187,
"calib/mu_c": 0.9751017964071855,
"calib/mu_w": 0.6184523809523809,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19411155378486056,
"calib/std_conf": 0.2826524457786999,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.38418827519379845,
"calib/step_q_c_n": 2064.0,
"calib/step_q_gap": 0.02467730562550402,
"calib/step_q_w": 0.3595109695682944,
"calib/step_q_w_n": 1413.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 1955.0,
"completions/max_terminated_length": 1955.0,
"completions/mean_length": 668.58984375,
"completions/mean_terminated_length": 681.9083862304688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 261.0,
"epoch": 0.20906666666666668,
"grad_norm": 1.3142296075820923,
"kl": 0.24029541015625,
"learning_rate": 1.3888888888888888e-07,
"loss": -0.0081,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.022626737132668495,
"mask/share_reasoning": 0.7662988901138306,
"mask/share_step_conf": 0.19154314696788788,
"num_tokens": 59461055.0,
"reward": 0.7016684412956238,
"reward_std": 0.1755383312702179,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.8039993047714233,
"rewards/format_reward_step": 0.98046875,
"rewards/step_margin_reward": 0.27277499437332153,
"step": 196
},
{
"adv/mean_abs_final_conf": 0.6367141008377075,
"adv/mean_abs_reasoning": 0.538066565990448,
"adv/mean_abs_step_conf": 0.7363102436065674,
"adv/ratio_final_to_reasoning": 1.1833370461620742,
"adv/ratio_step_to_reasoning": 1.3684370859415909,
"adv/std_final_conf": 0.8445119261741638,
"adv/std_reasoning": 0.7930350303649902,
"adv/std_step_conf": 0.9213052988052368,
"calib/answer_extract_rate": 0.91015625,
"calib/avg_num_step_conf": 16.94140625,
"calib/ece": 0.19726609442060083,
"calib/final_conf_rate": 0.91015625,
"calib/format_rate": 0.91015625,
"calib/frac_conf_gt_0.9": 0.648068669527897,
"calib/gap": 0.29037785947712436,
"calib/mean_conf": 0.7663648068669527,
"calib/mu_c": 0.8660653594771243,
"calib/mu_w": 0.5756874999999999,
"calib/nonempty_final_conf_rate": 0.91015625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15348927038626609,
"calib/std_conf": 0.3433324318520512,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.35348792508624943,
"calib/step_q_c_n": 2029.0,
"calib/step_q_gap": -0.037050419801098966,
"calib/step_q_w": 0.3905383448873484,
"calib/step_q_w_n": 2308.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08203125,
"completions/max_length": 2868.0,
"completions/max_terminated_length": 2868.0,
"completions/mean_length": 719.6328125,
"completions/mean_terminated_length": 783.9404296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 268.0,
"epoch": 0.21013333333333334,
"grad_norm": 1.5192164182662964,
"kl": 0.218231201171875,
"learning_rate": 1.1111111111111112e-07,
"loss": -0.2241,
"mask/has_final_conf_rate": 0.91015625,
"mask/share_final_conf": 0.018830955028533936,
"mask/share_reasoning": 0.7327144145965576,
"mask/share_step_conf": 0.16642341017723083,
"num_tokens": 59750337.0,
"reward": 0.65471351146698,
"reward_std": 0.2487749457359314,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.7058836817741394,
"rewards/format_reward_step": 0.91015625,
"rewards/step_margin_reward": 0.30198079347610474,
"step": 197
},
{
"adv/mean_abs_final_conf": 0.4821930527687073,
"adv/mean_abs_reasoning": 0.4194025695323944,
"adv/mean_abs_step_conf": 0.7058519124984741,
"adv/ratio_final_to_reasoning": 1.1497141119243022,
"adv/ratio_step_to_reasoning": 1.6829937720349482,
"adv/std_final_conf": 0.740700900554657,
"adv/std_reasoning": 0.7016922831535339,
"adv/std_step_conf": 0.891240119934082,
"calib/answer_extract_rate": 0.91015625,
"calib/avg_num_step_conf": 17.49609375,
"calib/ece": 0.07405150214592279,
"calib/final_conf_rate": 0.91015625,
"calib/format_rate": 0.91015625,
"calib/frac_conf_gt_0.9": 0.5836909871244635,
"calib/gap": 0.5530430402930404,
"calib/mean_conf": 0.7119914163090129,
"calib/mu_c": 0.8662738095238095,
"calib/mu_w": 0.31323076923076915,
"calib/nonempty_final_conf_rate": 0.91015625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03250643776824039,
"calib/std_conf": 0.3618361465111361,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.37521608040201004,
"calib/step_q_c_n": 2189.0,
"calib/step_q_gap": -0.013832784226810946,
"calib/step_q_w": 0.389048864628821,
"calib/step_q_w_n": 2290.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0859375,
"completions/max_length": 3020.0,
"completions/max_terminated_length": 3020.0,
"completions/mean_length": 718.3984375,
"completions/mean_terminated_length": 785.940185546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 253.0,
"epoch": 0.2112,
"grad_norm": 1.0711225271224976,
"kl": 0.2114105224609375,
"learning_rate": 8.333333333333334e-08,
"loss": -0.2661,
"mask/has_final_conf_rate": 0.91015625,
"mask/share_final_conf": 0.019884463399648666,
"mask/share_reasoning": 0.7254239320755005,
"mask/share_step_conf": 0.16875414550304413,
"num_tokens": 60039631.0,
"reward": 0.681763768196106,
"reward_std": 0.21229125559329987,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.810340940952301,
"rewards/format_reward_step": 0.91015625,
"rewards/step_margin_reward": 0.239905446767807,
"step": 198
},
{
"adv/mean_abs_final_conf": 0.5740118026733398,
"adv/mean_abs_reasoning": 0.4569406807422638,
"adv/mean_abs_step_conf": 0.6765886545181274,
"adv/ratio_final_to_reasoning": 1.2562063892864679,
"adv/ratio_step_to_reasoning": 1.4806925341360786,
"adv/std_final_conf": 0.8104844689369202,
"adv/std_reasoning": 0.7208795547485352,
"adv/std_step_conf": 0.8912445902824402,
"calib/answer_extract_rate": 0.921875,
"calib/avg_num_step_conf": 17.36328125,
"calib/ece": 0.18282838983050834,
"calib/final_conf_rate": 0.921875,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.6991525423728814,
"calib/gap": 0.26619367201426025,
"calib/mean_conf": 0.8040148305084746,
"calib/mu_c": 0.8784588235294117,
"calib/mu_w": 0.6122651515151515,
"calib/nonempty_final_conf_rate": 0.921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.13325211864406766,
"calib/std_conf": 0.3146852725416814,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.3475078777442962,
"calib/step_q_c_n": 2323.0,
"calib/step_q_gap": -0.057662951661924366,
"calib/step_q_w": 0.40517082940622057,
"calib/step_q_w_n": 2122.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2059.0,
"completions/max_terminated_length": 2059.0,
"completions/mean_length": 740.796875,
"completions/mean_terminated_length": 803.5762939453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 239.0,
"epoch": 0.21226666666666666,
"grad_norm": 1.425162672996521,
"kl": 0.22003173828125,
"learning_rate": 5.555555555555556e-08,
"loss": -0.3178,
"mask/has_final_conf_rate": 0.921875,
"mask/share_final_conf": 0.01996612176299095,
"mask/share_reasoning": 0.7292059659957886,
"mask/share_step_conf": 0.17270290851593018,
"num_tokens": 60333475.0,
"reward": 0.6757093667984009,
"reward_std": 0.24863620102405548,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.7372885942459106,
"rewards/format_reward_step": 0.921875,
"rewards/step_margin_reward": 0.29694265127182007,
"step": 199
},
{
"adv/mean_abs_final_conf": 0.4264031648635864,
"adv/mean_abs_reasoning": 0.29352903366088867,
"adv/mean_abs_step_conf": 0.47116726636886597,
"adv/ratio_final_to_reasoning": 1.4526779839987003,
"adv/ratio_step_to_reasoning": 1.6051811314624538,
"adv/std_final_conf": 0.7020553350448608,
"adv/std_reasoning": 0.5962840914726257,
"adv/std_step_conf": 0.7582021355628967,
"calib/answer_extract_rate": 0.90234375,
"calib/avg_num_step_conf": 18.41015625,
"calib/ece": 0.05049134199134204,
"calib/final_conf_rate": 0.90234375,
"calib/format_rate": 0.90234375,
"calib/frac_conf_gt_0.9": 0.645021645021645,
"calib/gap": 0.6014358986992512,
"calib/mean_conf": 0.764517316017316,
"calib/mu_c": 0.9181308139534884,
"calib/mu_w": 0.31669491525423726,
"calib/nonempty_final_conf_rate": 0.90234375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.03520995670995676,
"calib/std_conf": 0.34323008848227377,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.34493751724137933,
"calib/step_q_c_n": 2175.0,
"calib/step_q_gap": -0.04065893665932985,
"calib/step_q_w": 0.3855964539007092,
"calib/step_q_w_n": 2538.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09765625,
"completions/max_length": 1964.0,
"completions/max_terminated_length": 1964.0,
"completions/mean_length": 724.07421875,
"completions/mean_terminated_length": 802.437255859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 209.0,
"epoch": 0.21333333333333335,
"grad_norm": 1.2463358640670776,
"kl": 0.20904541015625,
"learning_rate": 2.777777777777778e-08,
"loss": -0.287,
"mask/has_final_conf_rate": 0.90234375,
"mask/share_final_conf": 0.018621867522597313,
"mask/share_reasoning": 0.7213960886001587,
"mask/share_step_conf": 0.16232578456401825,
"num_tokens": 60626886.0,
"reward": 0.7203070521354675,
"reward_std": 0.15783384442329407,
"rewards/accuracy_reward_step": 0.671875,
"rewards/final_brier_reward_step": 0.8304967880249023,
"rewards/format_reward_step": 0.90234375,
"rewards/step_margin_reward": 0.2952735424041748,
"step": 200
},
{
"epoch": 0.21333333333333335,
"step": 200,
"total_flos": 0.0,
"train_loss": -0.13025389663525858,
"train_runtime": 14198.202,
"train_samples_per_second": 3.606,
"train_steps_per_second": 0.014
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 60626886,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}